-rw-r--r--  llvm/lib/Target/X86/X86ExpandPseudo.cpp            | 32
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp            | 65
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h              |  2
-rw-r--r--  llvm/lib/Target/X86/X86InstrCompiler.td            | 46
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.td                | 14
-rw-r--r--  llvm/test/CodeGen/X86/atomic128.ll                 | 11
-rw-r--r--  llvm/test/CodeGen/X86/base-pointer-and-cmpxchg.ll  | 51
7 files changed, 206 insertions, 15 deletions
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 2af2be621ff..d7d0d3cb561 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -186,6 +186,38 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBBI->eraseFromParent();
return true;
}
+ case X86::LCMPXCHG8B_SAVE_EBX:
+ case X86::LCMPXCHG16B_SAVE_RBX: {
+ // Perform the following transformation.
+ // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
+ // =>
+ // [E|R]BX = InArg
+ // actualcmpxchg Addr
+ // [E|R]BX = SaveRbx
+ const MachineOperand &InArg = MBBI->getOperand(6);
+ unsigned SaveRbx = MBBI->getOperand(7).getReg();
+
+ unsigned ActualInArg =
+ Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
+ // Copy the input argument of the pseudo into the argument of the
+ // actual instruction.
+ TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, InArg.getReg(),
+ InArg.isKill());
+ // Create the actual instruction.
+ unsigned ActualOpc =
+ Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::LCMPXCHG8B : X86::LCMPXCHG16B;
+ MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(ActualOpc));
+ // Copy the operands related to the address.
+ for (unsigned Idx = 1; Idx < 6; ++Idx)
+ NewInstr->addOperand(MBBI->getOperand(Idx));
+ // Finally, restore the value of RBX.
+ TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, SaveRbx,
+ /*SrcIsKill*/ true);
+
+ // Delete the pseudo.
+ MBBI->eraseFromParent();
+ return true;
+ }
}
llvm_unreachable("Previous switch has a fallthrough?");
}
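
The expansion above boils down to "save RBX in a scratch register, load the new low half into RBX, run the real CMPXCHG, put RBX back". A minimal user-level sketch of that same save/set/restore shape, written with GCC/Clang extended inline asm rather than LLVM APIs (the function and operand names are illustrative, and a CPU with CMPXCHG16B plus a 16-byte-aligned operand are assumed; this is not part of the patch):

// Sketch only: mirrors the shape of the pseudo expansion, not LLVM's codegen.
#include <cstdint>

bool cas16b(__int128 *addr, __int128 *expected, __int128 desired) {
  uint64_t new_lo = (uint64_t)desired;
  uint64_t new_hi = (uint64_t)((unsigned __int128)desired >> 64);
  uint64_t exp_lo = (uint64_t)*expected;
  uint64_t exp_hi = (uint64_t)((unsigned __int128)*expected >> 64);
  uint64_t saved_rbx;
  bool ok;
  asm volatile("movq %%rbx, %[save]\n\t"     // SaveRbx = RBX
               "movq %[lo], %%rbx\n\t"       // RBX = InArg (low half of the new value)
               "lock cmpxchg16b %[mem]\n\t"  // compares RDX:RAX, stores RCX:RBX on match
               "sete %[ok]\n\t"
               "movq %[save], %%rbx"         // RBX = SaveRbx
               : [mem] "+m"(*addr), [save] "=&r"(saved_rbx), [ok] "=m"(ok),
                 "+a"(exp_lo), "+d"(exp_hi)
               : [lo] "r"(new_lo), "c"(new_hi)
               : "cc", "memory");
  *expected = ((unsigned __int128)exp_hi << 64) | exp_lo;
  return ok;
}

Note that RBX cannot simply be listed in the clobber list when it is reserved as the base pointer, which is exactly why the dance above (and the pseudo) is needed.
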
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index eac67d1e67d..bbbbf3e6537 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21254,20 +21254,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, dl, HalfT));
- swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
- Regs64bit ? X86::RBX : X86::EBX,
- swapInL, cpInH.getValue(1));
- swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
- Regs64bit ? X86::RCX : X86::ECX,
- swapInH, swapInL.getValue(1));
- SDValue Ops[] = { swapInH.getValue(0),
- N->getOperand(1),
- swapInH.getValue(1) };
+ swapInH =
+ DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, cpInH.getValue(1));
+ // If the current function needs the base pointer, RBX,
+ // we cannot use cmpxchg directly.
+ // Indeed, the lowering of that instruction will clobber
+ // that register, and since RBX will then be a reserved register,
+ // the register allocator will not make sure its value is
+ // properly saved and restored around this live-range.
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
- unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
- X86ISD::LCMPXCHG8_DAG;
- SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
+ (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
+ // ISel prefers the LCMPXCHG64 variant.
+ // If that assert breaks, it means this is not the case anymore,
+ // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
+ // not just EBX. This is a matter of accepting an i64 input for that
+ // pseudo, and restoring into a register of the right width
+ // in the expand-pseudo pass. Everything else should just work.
+ assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
+ "Saving only half of the RBX");
+ unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
+ : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
+ SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX,
+ HalfT, swapInH.getValue(1));
+ SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
+ RBXSave,
+ /*Glue*/ RBXSave.getValue(2)};
+ Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ } else {
+ unsigned Opcode =
+ Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
+ swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX, swapInL,
+ swapInH.getValue(1));
+ SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
+ swapInL.getValue(1)};
+ Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ }
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
@@ -21422,6 +21451,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
+ case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
+ return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
+ case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
+ return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
case X86ISD::LADD: return "X86ISD::LADD";
case X86ISD::LSUB: return "X86ISD::LSUB";
case X86ISD::LOR: return "X86ISD::LOR";
@@ -23569,6 +23602,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VFMSUBADDPDr213rY:
case X86::VFMSUBADDPSr213rY:
return emitFMA3Instr(MI, BB);
+ case X86::LCMPXCHG8B_SAVE_EBX:
+ case X86::LCMPXCHG16B_SAVE_RBX: {
+ unsigned BasePtr =
+ MI->getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
+ if (!BB->isLiveIn(BasePtr))
+ BB->addLiveIn(BasePtr);
+ return BB;
+ }
}
}
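
For reference, the half-splitting that the ReplaceNodeResults hunk above performs with EXTRACT_ELEMENT 0/1 and the CopyToReg/CopyFromReg pairs corresponds to plain integer arithmetic: the expected value travels in RDX:RAX, the new value in RCX:RBX (or the 32-bit halves for cmpxchg8b), and the result is rebuilt from the RAX/RDX copies (cpOutL above and its high-half counterpart). A small, self-contained C++ illustration of that mapping (not LLVM code, shown only to make the register convention concrete):

#include <cstdint>

// EXTRACT_ELEMENT ..., 0 -> low half (RAX for the expected value, RBX for the new value).
static uint64_t lo(unsigned __int128 v) { return (uint64_t)v; }

// EXTRACT_ELEMENT ..., 1 -> high half (RDX for the expected value, RCX for the new value).
static uint64_t hi(unsigned __int128 v) { return (uint64_t)(v >> 64); }

// Rebuilding the wide result from the low (RAX) and high (RDX) copies.
static unsigned __int128 join(uint64_t out_lo, uint64_t out_hi) {
  return ((unsigned __int128)out_hi << 64) | out_lo;
}
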
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 57ba49b5035..94e2f54d6b2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -523,6 +523,8 @@ namespace llvm {
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
LCMPXCHG16_DAG,
+ LCMPXCHG8_SAVE_EBX_DAG,
+ LCMPXCHG16_SAVE_RBX_DAG,
/// LOCK-prefixed arithmetic read-modify-write instructions.
/// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 6ed62a00451..2c92cc370b5 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -737,6 +737,38 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
IIC_CMPX_LOCK_8B>;
}
+// This pseudo must be used when the frame uses RBX as
+// the base pointer. Indeed, in such a situation RBX is a reserved
+// register and the register allocator will ignore any use/def of
+// it. In other words, the register allocator will not fix the clobbering
+// of RBX that happens when setting the arguments of the instruction.
+//
+// Unlike the actual related instruction, we mark that this one
+// defines EBX (instead of using EBX).
+// The rationale is that we will define RBX during the expansion of
+// the pseudo. The argument feeding EBX is ebx_input.
+//
+// The additional argument, $ebx_save, is a temporary register used to
+// save the value of RBX across the actual instruction.
+//
+// To make sure the register assigned to $ebx_save does not interfere with
+// the definition of the actual instruction, we use a definition $dst which
+// is tied to $ebx_save. That way, the live-range of $ebx_save spans across
+// the instruction and we are sure we will have a valid register to restore
+// the value of RBX.
+let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
+ SchedRW = [WriteALULd, WriteRMW], isCodeGenOnly = 1, isPseudo = 1,
+ Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in {
+def LCMPXCHG8B_SAVE_EBX :
+ I<0, Pseudo, (outs GR32:$dst),
+ (ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
+ !strconcat("cmpxchg8b", "\t$ptr"),
+ [(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
+ GR32:$ebx_save))],
+ IIC_CMPX_LOCK_8B>;
+}
+
+
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
@@ -744,6 +776,20 @@ defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
IIC_CMPX_LOCK_16B>, REX_W;
}
+// Same as LCMPXCHG8B_SAVE_EBX but for the 16-byte variant.
+let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
+ Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW],
+ isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
+ usesCustomInserter = 1 in {
+def LCMPXCHG16B_SAVE_RBX :
+ I<0, Pseudo, (outs GR64:$dst),
+ (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
+ !strconcat("cmpxchg16b", "\t$ptr"),
+ [(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
+ GR64:$rbx_save))],
+ IIC_CMPX_LOCK_16B>;
+}
+
defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index aaf87a7b42a..873a44cb499 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -71,6 +71,12 @@ def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i64>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisPtrTy<1>,
@@ -155,6 +161,14 @@ def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG",
+ SDTX86caspairSaveEbx8,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG",
+ SDTX86caspairSaveRbx16,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
diff --git a/llvm/test/CodeGen/X86/atomic128.ll b/llvm/test/CodeGen/X86/atomic128.ll
index c41269b0b60..1bf7bfbfa26 100644
--- a/llvm/test/CodeGen/X86/atomic128.ll
+++ b/llvm/test/CodeGen/X86/atomic128.ll
@@ -4,9 +4,14 @@
define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK-LABEL: val_compare_and_swap:
+; Due to the scheduling of cmpxchg right after isel, and given that the
+; machine scheduler and copy coalescer do not touch physical-register
+; live-ranges, we end up with a useless copy.
+;
+; CHECK: movq %rcx, [[TMP:%r[0-9a-z]+]]
; CHECK: movq %rsi, %rax
-; CHECK: movq %rcx, %rbx
; CHECK: movq %r8, %rcx
+; CHECK: movq [[TMP]], %rbx
; CHECK: lock
; CHECK: cmpxchg16b (%rdi)
@@ -216,8 +221,8 @@ define i128 @atomic_load_seq_cst(i128* %p) {
; CHECK-LABEL: atomic_load_seq_cst:
; CHECK: xorl %eax, %eax
; CHECK: xorl %edx, %edx
-; CHECK: xorl %ebx, %ebx
; CHECK: xorl %ecx, %ecx
+; CHECK: xorl %ebx, %ebx
; CHECK: lock
; CHECK: cmpxchg16b (%rdi)
@@ -229,8 +234,8 @@ define i128 @atomic_load_relaxed(i128* %p) {
; CHECK: atomic_load_relaxed:
; CHECK: xorl %eax, %eax
; CHECK: xorl %edx, %edx
-; CHECK: xorl %ebx, %ebx
; CHECK: xorl %ecx, %ecx
+; CHECK: xorl %ebx, %ebx
; CHECK: lock
; CHECK: cmpxchg16b (%rdi)
diff --git a/llvm/test/CodeGen/X86/base-pointer-and-cmpxchg.ll b/llvm/test/CodeGen/X86/base-pointer-and-cmpxchg.ll
new file mode 100644
index 00000000000..8de6d64428e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/base-pointer-and-cmpxchg.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+cx16 -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE --check-prefix=USE_BASE_64 %s
+; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+cx16 -x86-use-base-pointer=false -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=DONT_USE_BASE %s
+; RUN: llc -mtriple=x86_64-linux-gnux32 -mattr=+cx16 -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE --check-prefix=USE_BASE_32 %s
+; RUN: llc -mtriple=x86_64-linux-gnux32 -mattr=+cx16 -x86-use-base-pointer=false -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=DONT_USE_BASE %s
+
+; This function uses a dynamically allocated stack to force the use
+; of a frame pointer.
+; The inline asm clobbers a bunch of registers to make sure
+; the frame pointer will need to be used (for spilling in that case).
+;
+; Then, we check that when rbx is used as the base pointer, we do not
+; use cmpxchg directly: that instruction requires clobbering rbx to set
+; up its arguments, and since rbx is then a reserved register, RA cannot
+; fix the code for us, so rbx has to be saved and restored around it.
+;
+; CHECK-LABEL: cmp_and_swap16:
+; Check that we actually use rbx.
+; gnux32 uses the 32-bit variants of the registers.
+; USE_BASE_64: movq %rsp, %rbx
+; USE_BASE_32: movl %esp, %ebx
+;
+; Make sure the base pointer is saved before the RBX argument for
+; cmpxchg16b is set.
+;
+; Because of how the test is written, we spill SAVE_RBX.
+; However, it would have been perfectly fine to just keep it in a register.
+; USE_BASE: movq %rbx, [[SAVE_RBX_SLOT:[0-9]*\(%[er]bx\)]]
+;
+; SAVE_RBX must be in a register before we clobber rbx.
+; It is fine to use any register except rbx and the ones defined and used
+; by cmpxchg. Since such a regex would be complicated to write, just stick
+; to the numbered registers. The bottom line is: if this test case fails
+; because of that regex, it is likely just the regex being too conservative.
+; USE_BASE: movq [[SAVE_RBX_SLOT]], [[SAVE_RBX:%r[0-9]+]]
+;
+; USE_BASE: movq {{[^ ]+}}, %rbx
+; USE_BASE-NEXT: cmpxchg16b
+; USE_BASE-NEXT: movq [[SAVE_RBX]], %rbx
+;
+; DONT_USE_BASE-NOT: movq %rsp, %rbx
+; DONT_USE_BASE-NOT: movl %esp, %ebx
+; DONT_USE_BASE: cmpxchg
+define i1 @cmp_and_swap16(i128 %a, i128 %b, i128* %addr, i32 %n) {
+ %dummy = alloca i32, i32 %n
+ tail call void asm sideeffect "nop", "~{rax},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %cmp = cmpxchg i128* %addr, i128 %a, i128 %b seq_cst seq_cst
+ %res = extractvalue { i128, i1 } %cmp, 1
+ %idx = getelementptr i32, i32* %dummy, i32 5
+ store i32 %n, i32* %idx
+ ret i1 %res
+}
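
For context, the IR above is roughly what a compiler emits for source like the following rough C++ analogue of @cmp_and_swap16 (an assumption about the kind of source that hits this path, not part of the patch; a GCC/Clang target with __int128 and something like -mcx16 is assumed so the builtin is inlined to cmpxchg16b):

#include <cstddef>

bool cmp_and_swap16(__int128 *addr, __int128 oldval, __int128 newval,
                    std::size_t n) {
  // Over-aligned plus dynamically sized locals: together they require stack
  // realignment and a variable-size frame, which is what makes the backend
  // reserve a base pointer (rbx) for this function, as the -stackrealign
  // run lines do for the IR test.
  alignas(32) volatile char fixed[32];
  volatile char *dyn = static_cast<volatile char *>(__builtin_alloca(n));
  fixed[0] = static_cast<char>(n);
  dyn[0] = fixed[0];

  // Lowers to `cmpxchg i128* %addr, i128 %oldval, i128 %newval seq_cst`.
  return __sync_bool_compare_and_swap(addr, oldval, newval);
}
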