diff options
| -rw-r--r-- | llvm/lib/Target/X86/X86.td | 50 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 59 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrCompiler.td | 12 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.td | 11 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 5 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/cmpxchg16b.ll | 13 | 
8 files changed, 107 insertions, 48 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 4ccb43fe18c..e3454b716ad 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -68,6 +68,9 @@ def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",  def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",                                        "Support 64-bit instructions",                                        [FeatureCMOV]>; +def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true", +                                      "64-bit with cmpxchg16b", +                                      [Feature64Bit]>;  def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",                                         "Bit testing of memory is slow">;  def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", @@ -112,26 +115,31 @@ def : Proc<"pentium3m",       [FeatureSSE1, FeatureSlowBTMem]>;  def : Proc<"pentium-m",       [FeatureSSE2, FeatureSlowBTMem]>;  def : Proc<"pentium4",        [FeatureSSE2]>;  def : Proc<"pentium4m",       [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"x86-64",          [FeatureSSE2,   Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"x86-64",          [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;  def : Proc<"yonah",           [FeatureSSE3, FeatureSlowBTMem]>;  def : Proc<"prescott",        [FeatureSSE3, FeatureSlowBTMem]>; -def : Proc<"nocona",          [FeatureSSE3,   Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"core2",           [FeatureSSSE3,  Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"penryn",          [FeatureSSE41,  Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"atom",            [FeatureSSE3,   Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"nocona",          [FeatureSSE3, FeatureCMPXCHG16B, +                               FeatureSlowBTMem]>; +def : Proc<"core2",           [FeatureSSSE3, FeatureCMPXCHG16B, +                               FeatureSlowBTMem]>; +def : 
Proc<"penryn",          [FeatureSSE41, FeatureCMPXCHG16B, +                               FeatureSlowBTMem]>; +def : Proc<"atom",            [FeatureSSE3,  FeatureCMPXCHG16B, +                               FeatureSlowBTMem]>;  // "Arrandale" along with corei3 and corei5 -def : Proc<"corei7",          [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem, -                               FeatureFastUAMem, FeatureAES]>; -def : Proc<"nehalem",         [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem, -                               FeatureFastUAMem]>; +def : Proc<"corei7",          [FeatureSSE42, FeatureCMPXCHG16B, +                               FeatureSlowBTMem, FeatureFastUAMem, FeatureAES]>; +def : Proc<"nehalem",         [FeatureSSE42,  FeatureCMPXCHG16B, +                               FeatureSlowBTMem, FeatureFastUAMem]>;  // Westmere is a similar machine to nehalem with some additional features.  // Westmere is the corei3/i5/i7 path from nehalem to sandybridge -def : Proc<"westmere",        [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem, -                               FeatureFastUAMem, FeatureAES, FeatureCLMUL]>; +def : Proc<"westmere",        [FeatureSSE42, FeatureCMPXCHG16B, +                               FeatureSlowBTMem, FeatureFastUAMem, FeatureAES, +                               FeatureCLMUL]>;  // SSE is not listed here since llvm treats AVX as a reimplementation of SSE,  // rather than a superset.  // FIXME: Disabling AVX for now since it's not ready. 
-def : Proc<"corei7-avx",      [FeatureSSE42, Feature64Bit, +def : Proc<"corei7-avx",      [FeatureSSE42, FeatureCMPXCHG16B,                                 FeatureAES, FeatureCLMUL]>;  def : Proc<"k6",              [FeatureMMX]>; @@ -150,19 +158,21 @@ def : Proc<"athlon64",        [FeatureSSE2,   Feature3DNowA, Feature64Bit,                                 FeatureSlowBTMem]>;  def : Proc<"athlon-fx",       [FeatureSSE2,   Feature3DNowA, Feature64Bit,                                 FeatureSlowBTMem]>; -def : Proc<"k8-sse3",         [FeatureSSE3,   Feature3DNowA, Feature64Bit, +def : Proc<"k8-sse3",         [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,                                 FeatureSlowBTMem]>; -def : Proc<"opteron-sse3",    [FeatureSSE3,   Feature3DNowA, Feature64Bit, +def : Proc<"opteron-sse3",    [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,                                 FeatureSlowBTMem]>; -def : Proc<"athlon64-sse3",   [FeatureSSE3,   Feature3DNowA, Feature64Bit, +def : Proc<"athlon64-sse3",   [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,                                 FeatureSlowBTMem]>;  def : Proc<"amdfam10",        [FeatureSSE3,   FeatureSSE4A, -                               Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>; +                               Feature3DNowA, FeatureCMPXCHG16B, +                               FeatureSlowBTMem]>;  def : Proc<"barcelona",       [FeatureSSE3,   FeatureSSE4A, -                               Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"istanbul",        [Feature3DNowA, Feature64Bit, FeatureSSE4A, -                               Feature3DNowA]>; -def : Proc<"shanghai",        [Feature3DNowA, Feature64Bit, FeatureSSE4A, +                               Feature3DNowA, FeatureCMPXCHG16B, +                               FeatureSlowBTMem]>; +def : Proc<"istanbul",        [Feature3DNowA, FeatureCMPXCHG16B, +                               FeatureSSE4A, Feature3DNowA]>; +def : 
Proc<"shanghai",        [Feature3DNowA, FeatureCMPXCHG16B, FeatureSSE4A,                                 Feature3DNowA]>;  def : Proc<"winchip-c6",      [FeatureMMX]>; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 89aaff53cfa..98181b580ec 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -478,6 +478,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)      setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);    } +  if (Subtarget->hasCmpxchg16b()) { +    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); +  } +    // FIXME - use subtarget debug flags    if (!Subtarget->isTargetDarwin() &&        !Subtarget->isTargetELF() && @@ -10421,37 +10425,48 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,    }    case ISD::ATOMIC_CMP_SWAP: {      EVT T = N->getValueType(0); -    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); +    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); +    bool Regs64bit = T == MVT::i128; +    EVT HalfT = Regs64bit ? 
MVT::i64 : MVT::i32;      SDValue cpInL, cpInH; -    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), -                        DAG.getConstant(0, MVT::i32)); -    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), -                        DAG.getConstant(1, MVT::i32)); -    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); -    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, -                             cpInL.getValue(1)); +    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), +                        DAG.getConstant(0, HalfT)); +    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), +                        DAG.getConstant(1, HalfT)); +    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, +                             Regs64bit ? X86::RAX : X86::EAX, +                             cpInL, SDValue()); +    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, +                             Regs64bit ? 
X86::RDX : X86::EDX, +                             cpInH, cpInL.getValue(1));      SDValue swapInL, swapInH; -    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), -                          DAG.getConstant(0, MVT::i32)); -    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), -                          DAG.getConstant(1, MVT::i32)); -    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, -                               cpInH.getValue(1)); -    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, -                               swapInL.getValue(1)); +    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), +                          DAG.getConstant(0, HalfT)); +    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), +                          DAG.getConstant(1, HalfT)); +    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, +                               Regs64bit ? X86::RBX : X86::EBX, +                               swapInL, cpInH.getValue(1)); +    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, +                               Regs64bit ? X86::RCX : X86::ECX,  +                               swapInH, swapInL.getValue(1));      SDValue Ops[] = { swapInH.getValue(0),                        N->getOperand(1),                        swapInH.getValue(1) };      SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);      MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); -    SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, +    unsigned Opcode = Regs64bit ? 
X86ISD::LCMPXCHG16_DAG : +                                  X86ISD::LCMPXCHG8_DAG; +    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,                                               Ops, 3, T, MMO); -    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, -                                        MVT::i32, Result.getValue(1)); -    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, -                                        MVT::i32, cpOutL.getValue(2)); +    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, +                                        Regs64bit ? X86::RAX : X86::EAX, +                                        HalfT, Result.getValue(1)); +    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, +                                        Regs64bit ? X86::RDX : X86::EDX, +                                        HalfT, cpOutL.getValue(2));      SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; -    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); +    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));      Results.push_back(cpOutH.getValue(1));      return;    } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index c13d0c762cc..6419879529a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -303,9 +303,10 @@ namespace llvm {        ATOMNAND64_DAG,        ATOMSWAP64_DAG, -      // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap. +      // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.        LCMPXCHG_DAG,        LCMPXCHG8_DAG, +      LCMPXCHG16_DAG,        // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.        
VZEXT_LOAD, diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 2801c9bb3f0..829ea279201 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -665,12 +665,20 @@ def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),  // Atomic compare and swap.  let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], -    isCodeGenOnly = 1 in { +    isCodeGenOnly = 1 in  def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),                 "lock\n\t"                 "cmpxchg8b\t$ptr",                 [(X86cas8 addr:$ptr)]>, TB, LOCK; -} + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], +    isCodeGenOnly = 1 in +def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), +                    "lock\n\t" +                    "cmpxchg16b\t$ptr", +                    [(X86cas16 addr:$ptr)]>, TB, LOCK, +                    Requires<[HasCmpxchg16b]>; +  let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in {  def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),                 "lock\n\t" diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index a355521e5b9..a09edd4747a 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -65,7 +65,7 @@ def SDTX86SetCC_C : SDTypeProfile<1, 2,  def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,                                       SDTCisVT<2, i8>]>; -def SDTX86cas8 : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;  def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,                                  SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>; @@ -133,9 +133,13 @@ def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;  def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,                          [SDNPHasChain, SDNPInGlue, SDNPOutGlue, 
SDNPMayStore,                           SDNPMayLoad, SDNPMemOperand]>; -def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8, +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,                          [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,                           SDNPMayLoad, SDNPMemOperand]>; +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, +                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, +                         SDNPMayLoad, SDNPMemOperand]>; +  def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,                          [SDNPHasChain, SDNPMayStore,                           SDNPMayLoad, SDNPMemOperand]>; @@ -466,6 +470,7 @@ def HasFMA3      : Predicate<"Subtarget->hasFMA3()">;  def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;  def FPStackf32   : Predicate<"!Subtarget->hasXMM()">;  def FPStackf64   : Predicate<"!Subtarget->hasXMMInt()">; +def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;  def In32BitMode  : Predicate<"!Subtarget->is64Bit()">,                               AssemblerPredicate<"!Mode64Bit">;  def In64BitMode  : Predicate<"Subtarget->is64Bit()">, @@ -1190,7 +1195,7 @@ def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),  let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in  def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), -                    "cmpxchg16b\t$dst", []>, TB; +                    "cmpxchg16b\t$dst", []>, TB, Requires<[HasCmpxchg16b]>; diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 5e6c659e539..be77d879ea0 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -203,6 +203,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {    HasFMA3  = IsIntel && ((ECX >> 12) & 0x1);  ToggleFeature(X86::FeatureFMA3);    HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1); ToggleFeature(X86::FeaturePOPCNT);    HasAES   = IsIntel && ((ECX 
>> 25) & 0x1);  ToggleFeature(X86::FeatureAES); +  HasCmpxchg16b = ((ECX >> 13) & 0x1); ToggleFeature(X86::FeatureCMPXCHG16B);    if (IsIntel || IsAMD) {      // Determine if bit test memory instructions are slow. @@ -254,6 +255,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,    , IsBTMemSlow(false)    , IsUAMemFast(false)    , HasVectorUAMem(false) +  , HasCmpxchg16b(false)    , stackAlignment(8)    // FIXME: this is a known good value for Yonah. How about others?    , MaxInlineSizeThreshold(128) diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index d5c433f9aa9..c3a6d0f711d 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -100,6 +100,10 @@ protected:    /// operands. This may require setting a feature bit in the processor.    bool HasVectorUAMem; +  /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction; +  /// this is true for most x86-64 chips, but not the first AMD chips. +  bool HasCmpxchg16b; +    /// stackAlignment - The minimum alignment known to hold of the stack frame on    /// entry to the function and which must be maintained by every function.    
unsigned stackAlignment; @@ -168,6 +172,7 @@ public:    bool isBTMemSlow() const { return IsBTMemSlow; }    bool isUnalignedMemAccessFast() const { return IsUAMemFast; }    bool hasVectorUAMem() const { return HasVectorUAMem; } +  bool hasCmpxchg16b() const { return HasCmpxchg16b; }    const Triple &getTargetTriple() const { return TargetTriple; } diff --git a/llvm/test/CodeGen/X86/cmpxchg16b.ll b/llvm/test/CodeGen/X86/cmpxchg16b.ll new file mode 100644 index 00000000000..ba1c4ef9e22 --- /dev/null +++ b/llvm/test/CodeGen/X86/cmpxchg16b.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s + +; Basic 128-bit cmpxchg +define void @t1(i128* nocapture %p) nounwind ssp { +entry: +; CHECK: movl	$1, %ebx +; CHECK: lock +; CHECK-NEXT: cmpxchg16b +  %r = cmpxchg i128* %p, i128 0, i128 1 seq_cst +  ret void +} + +; FIXME: Handle 128-bit atomicrmw/load atomic/store atomic  | 

