Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp |  16
-rw-r--r--  llvm/lib/Target/X86/X86InstrCompiler.td |  10
-rw-r--r--  llvm/lib/Target/X86/X86MCInstLower.cpp  |   8
-rw-r--r--  llvm/test/CodeGen/X86/atomic_mi.ll      | 108
4 files changed, 64 insertions, 78 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f1be359862c..9cb7ed0c64f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39768,6 +39768,19 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
     return Ld->getBasePtr() == St->getBasePtr();
   };
 
+  auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
+    if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
+      return false;
+    if (!Op.hasOneUse())
+      return false;
+    SDNode *User = *Op->use_begin();
+    if (User->getOpcode() != ISD::ATOMIC_STORE)
+      return false;
+    auto *Ld = cast<AtomicSDNode>(Load);
+    auto *St = cast<AtomicSDNode>(User);
+    return Ld->getBasePtr() == St->getBasePtr();
+  };
+
   bool Commute = false;
   switch (Op.getOpcode()) {
   default: return false;
@@ -39802,6 +39815,9 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
         ((Commute && !isa<ConstantSDNode>(N1)) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
       return false;
+    if (IsFoldableAtomicRMW(N0, Op) ||
+        (Commute && IsFoldableAtomicRMW(N1, Op)))
+      return false;
   }
   }
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index f5c3463c57a..3264c5b6930 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -896,8 +896,14 @@ multiclass RELEASE_BINOP_MI<SDNode op> {
                     "#BINOP "#NAME#"8mr PSEUDO!",
                     [(atomic_store_8 addr:$dst, (op
                       (atomic_load_8 addr:$dst), GR8:$src))]>;
-  // NAME#16 is not generated as 16-bit arithmetic instructions are considered
-  // costly and avoided as far as possible by this backend anyway
+  def NAME#16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
+                    "#BINOP "#NAME#"16mi PSEUDO!",
+                    [(atomic_store_16 addr:$dst, (op
+                      (atomic_load_16 addr:$dst), (i16 imm:$src)))]>;
+  def NAME#16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
+                    "#BINOP "#NAME#"16mr PSEUDO!",
+                    [(atomic_store_16 addr:$dst, (op
+                      (atomic_load_16 addr:$dst), GR16:$src))]>;
   def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
                     "#BINOP "#NAME#"32mi PSEUDO!",
                     [(atomic_store_32 addr:$dst, (op
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index a5aaa69b9fa..a31aff94604 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -601,24 +601,32 @@ ReSimplify:
   case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
   case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
   case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
+  case X86::RELEASE_ADD16mi: OutMI.setOpcode(X86::ADD16mi); goto ReSimplify;
+  case X86::RELEASE_ADD16mr: OutMI.setOpcode(X86::ADD16mr); goto ReSimplify;
   case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
   case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
   case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
   case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
   case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
   case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
+  case X86::RELEASE_AND16mi: OutMI.setOpcode(X86::AND16mi); goto ReSimplify;
+  case X86::RELEASE_AND16mr: OutMI.setOpcode(X86::AND16mr); goto ReSimplify;
   case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
   case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
   case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
   case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
   case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
   case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
+  case X86::RELEASE_OR16mi: OutMI.setOpcode(X86::OR16mi); goto ReSimplify;
+  case X86::RELEASE_OR16mr: OutMI.setOpcode(X86::OR16mr); goto ReSimplify;
   case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
   case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
   case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
   case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
   case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
   case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
+  case X86::RELEASE_XOR16mi: OutMI.setOpcode(X86::XOR16mi); goto ReSimplify;
+  case X86::RELEASE_XOR16mr: OutMI.setOpcode(X86::XOR16mr); goto ReSimplify;
   case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
   case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
   case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
diff --git a/llvm/test/CodeGen/X86/atomic_mi.ll b/llvm/test/CodeGen/X86/atomic_mi.ll
index 5410e9133b2..03958f0ceab 100644
--- a/llvm/test/CodeGen/X86/atomic_mi.ll
+++ b/llvm/test/CodeGen/X86/atomic_mi.ll
@@ -209,17 +209,13 @@ define void @add_16i(i16* %p) {
 ; treat 16 bit arithmetic as expensive on X86/X86_64.
 ; X64-LABEL: add_16i:
 ; X64: # %bb.0:
-; X64-NEXT: movw (%rdi), %ax
-; X64-NEXT: addl $2, %eax
-; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: addw $2, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X32-LABEL: add_16i:
 ; X32: # %bb.0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: addl $2, %ecx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: addw $2, (%eax)
 ; X32-NEXT: retl
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = add i16 %1, 2
@@ -232,17 +228,14 @@ define void @add_16r(i16* %p, i16 %v) {
 ; treat 16 bit arithmetic as expensive on X86/X86_64.
 ; X64-LABEL: add_16r:
 ; X64: # %bb.0:
-; X64-NEXT: movw (%rdi), %ax
-; X64-NEXT: addl %esi, %eax
-; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: addw %si, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X32-LABEL: add_16r:
 ; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: addw {{[0-9]+}}(%esp), %cx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addw %ax, (%ecx)
 ; X32-NEXT: retl
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = add i16 %1, %v
@@ -506,17 +499,13 @@ define void @and_16i(i16* %p) {
 ; treat 16 bit arithmetic as expensive on X86/X86_64.
 ; X64-LABEL: and_16i:
 ; X64: # %bb.0:
-; X64-NEXT: movw (%rdi), %ax
-; X64-NEXT: andl $2, %eax
-; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: andw $2, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X32-LABEL: and_16i:
 ; X32: # %bb.0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: andl $2, %ecx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: andw $2, (%eax)
 ; X32-NEXT: retl
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = and i16 %1, 2
@@ -529,17 +518,14 @@ define void @and_16r(i16* %p, i16 %v) {
 ; treat 16 bit arithmetic as expensive on X86/X86_64.
 ; X64-LABEL: and_16r:
 ; X64: # %bb.0:
-; X64-NEXT: movw (%rdi), %ax
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: andw %si, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X32-LABEL: and_16r:
 ; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: andw {{[0-9]+}}(%esp), %cx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: andw %ax, (%ecx)
 ; X32-NEXT: retl
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = and i16 %1, %v
@@ -751,17 +737,13 @@ define void @or_8r(i8* %p, i8 %v) {
 define void @or_16i(i16* %p) {
 ; X64-LABEL: or_16i:
 ; X64: # %bb.0:
-; X64-NEXT: movw (%rdi), %ax
-; X64-NEXT: orl $2, %eax
-; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: orw $2, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X32-LABEL: or_16i:
 ; X32: # %bb.0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: orl $2, %ecx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: orw $2, (%eax)
 ; X32-NEXT: retl
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = or i16 %1, 2
@@ -772,17 +754,14 @@ define void @or_16i(i16* %p) {
 define void @or_16r(i16* %p, i16 %v) {
 ; X64-LABEL: or_16r:
 ; X64: # %bb.0:
-; X64-NEXT: movw (%rdi), %ax
-; X64-NEXT: orl %esi, %eax
-; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: orw %si, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X32-LABEL: or_16r:
 ; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: orw {{[0-9]+}}(%esp), %cx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: orw %ax, (%ecx)
 ; X32-NEXT: retl
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = or i16 %1, %v
@@ -994,17 +973,13 @@ define void @xor_8r(i8* %p, i8 %v) {
 define void @xor_16i(i16* %p) {
 ; X64-LABEL: xor_16i:
 ; X64: # %bb.0:
-; X64-NEXT: movw (%rdi), %ax
-; X64-NEXT: xorl $2, %eax
-; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: xorw $2, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X32-LABEL: xor_16i:
 ; X32: # %bb.0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: xorl $2, %ecx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: xorw $2, (%eax)
 ; X32-NEXT: retl
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = xor i16 %1, 2
@@ -1015,17 +990,14 @@ define void @xor_16i(i16* %p) {
 define void @xor_16r(i16* %p, i16 %v) {
 ; X64-LABEL: xor_16r:
 ; X64: # %bb.0:
-; X64-NEXT: movw (%rdi), %ax
-; X64-NEXT: xorl %esi, %eax
-; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: xorw %si, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X32-LABEL: xor_16r:
 ; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: xorw {{[0-9]+}}(%esp), %cx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: xorw %ax, (%ecx)
 ; X32-NEXT: retl
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = xor i16 %1, %v
@@ -1226,24 +1198,18 @@ define void @inc_16(i16* %p) {
 ; treat 16 bit arithmetic as expensive on X86/X86_64.
 ; FAST_INC-LABEL: inc_16:
 ; FAST_INC: # %bb.0:
-; FAST_INC-NEXT: movw (%rdi), %ax
-; FAST_INC-NEXT: incl %eax
-; FAST_INC-NEXT: movw %ax, (%rdi)
+; FAST_INC-NEXT: incw (%rdi)
 ; FAST_INC-NEXT: retq
 ;
 ; X32-LABEL: inc_16:
 ; X32: # %bb.0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: incl %ecx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: incw (%eax)
 ; X32-NEXT: retl
 ;
 ; SLOW_INC-LABEL: inc_16:
 ; SLOW_INC: # %bb.0:
-; SLOW_INC-NEXT: movw (%rdi), %ax
-; SLOW_INC-NEXT: addl $1, %eax
-; SLOW_INC-NEXT: movw %ax, (%rdi)
+; SLOW_INC-NEXT: addw $1, (%rdi)
 ; SLOW_INC-NEXT: retq
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = add i16 %1, 1
@@ -1379,24 +1345,18 @@ define void @dec_16(i16* %p) {
 ; treat 16 bit arithmetic as expensive on X86/X86_64.
 ; FAST_INC-LABEL: dec_16:
 ; FAST_INC: # %bb.0:
-; FAST_INC-NEXT: movw (%rdi), %ax
-; FAST_INC-NEXT: decl %eax
-; FAST_INC-NEXT: movw %ax, (%rdi)
+; FAST_INC-NEXT: decw (%rdi)
 ; FAST_INC-NEXT: retq
 ;
 ; X32-LABEL: dec_16:
 ; X32: # %bb.0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: decl %ecx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: decw (%eax)
 ; X32-NEXT: retl
 ;
 ; SLOW_INC-LABEL: dec_16:
 ; SLOW_INC: # %bb.0:
-; SLOW_INC-NEXT: movw (%rdi), %ax
-; SLOW_INC-NEXT: addl $-1, %eax
-; SLOW_INC-NEXT: movw %ax, (%rdi)
+; SLOW_INC-NEXT: addw $-1, (%rdi)
 ; SLOW_INC-NEXT: retq
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = sub i16 %1, 1
@@ -1527,17 +1487,13 @@ define void @not_16(i16* %p) {
 ; treat 16 bit arithmetic as expensive on X86/X86_64.
 ; X64-LABEL: not_16:
 ; X64: # %bb.0:
-; X64-NEXT: movw (%rdi), %ax
-; X64-NEXT: notl %eax
-; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: notw (%rdi)
 ; X64-NEXT: retq
 ;
 ; X32-LABEL: not_16:
 ; X32: # %bb.0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movw (%eax), %cx
-; X32-NEXT: notl %ecx
-; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: notw (%eax)
 ; X32-NEXT: retl
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = xor i16 %1, -1
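Note: the pattern the new pseudos (and the IsDesirableToPromoteOp change, which keeps i16 binops from being promoted to i32 when they would otherwise fold) cover is an atomic load and an atomic store of the same address with the arithmetic in between, as exercised by the tests above. A minimal standalone reproducer, assuming the same acquire/release orderings the tests use (the function name is illustrative, not part of the commit):

define void @add_16i_example(i16* %p) {
  ; With this change, x86-64 codegen should fold the whole sequence into a
  ; single memory-operand instruction, addw $2, (%rdi), instead of the old
  ; movw/addl/movw load-modify-store sequence.
  %old = load atomic i16, i16* %p acquire, align 2
  %new = add i16 %old, 2
  store atomic i16 %new, i16* %p release, align 2
  ret void
}

Feeding this through llc -mtriple=x86_64-- should show the folded form.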

