-rw-r--r-- | llvm/test/CodeGen/X86/atomic_mi.ll | 242
1 file changed, 212 insertions, 30 deletions
diff --git a/llvm/test/CodeGen/X86/atomic_mi.ll b/llvm/test/CodeGen/X86/atomic_mi.ll
index bf430eaea28..4c0439e8231 100644
--- a/llvm/test/CodeGen/X86/atomic_mi.ll
+++ b/llvm/test/CodeGen/X86/atomic_mi.ll
@@ -457,6 +457,188 @@ define void @add_32r_seq_cst(i32* %p, i32 %v) {
 ret void
 }
 
+; ----- SUB -----
+
+define void @sub_8r(i8* %p, i8 %v) {
+; X64-LABEL: sub_8r:
+; X64: # %bb.0:
+; X64-NEXT: movb (%rdi), %al
+; X64-NEXT: subb %sil, %al
+; X64-NEXT: movb %al, (%rdi)
+; X64-NEXT: retq
+;
+; X32-LABEL: sub_8r:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movb (%eax), %cl
+; X32-NEXT: subb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movb %cl, (%eax)
+; X32-NEXT: retl
+ %1 = load atomic i8, i8* %p seq_cst, align 1
+ %2 = sub i8 %1, %v
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @sub_16r(i16* %p, i16 %v) {
+; Currently the transformation is not done on 16-bit accesses, as the backend
+; treats 16-bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: sub_16r:
+; X64: # %bb.0:
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: subw %si, %ax
+; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: retq
+;
+; X32-LABEL: sub_16r:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzwl (%eax), %ecx
+; X32-NEXT: subw {{[0-9]+}}(%esp), %cx
+; X32-NEXT: movw %cx, (%eax)
+; X32-NEXT: retl
+ %1 = load atomic i16, i16* %p acquire, align 2
+ %2 = sub i16 %1, %v
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @sub_32r(i32* %p, i32 %v) {
+; X64-LABEL: sub_32r:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: subl %esi, %eax
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: retq
+;
+; X32-LABEL: sub_32r:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: retl
+ %1 = load atomic i32, i32* %p acquire, align 4
+ %2 = sub i32 %1, %v
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+}
+
+; The following is a corner case where the load is subtracted from itself. The
+; pattern matching should not fold this. We only test with 32-bit sub, but the
+; same applies to other sizes and operations.
+define void @sub_32r_self(i32* %p) {
+; X64-LABEL: sub_32r_self:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movl $0, (%rdi)
+; X64-NEXT: retq
+;
+; X32-LABEL: sub_32r_self:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl $0, (%eax)
+; X32-NEXT: retl
+ %1 = load atomic i32, i32* %p acquire, align 4
+ %2 = sub i32 %1, %1
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+}
+
+; The following is a corner case where the load's result is returned. The
+; optimizer isn't allowed to duplicate the load because it's atomic.
+define i32 @sub_32r_ret_load(i32* %p, i32 %v) {
+; X64-LABEL: sub_32r_ret_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: subl %esi, %ecx
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: retq
+;
+; X32-LABEL: sub_32r_ret_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: subl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, (%ecx)
+; X32-NEXT: retl
+; More code here; we just don't want it to load from P.
+ %1 = load atomic i32, i32* %p acquire, align 4
+ %2 = sub i32 %1, %v
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret i32 %1
+}
+
+define void @sub_64r(i64* %p, i64 %v) {
+; X64-LABEL: sub_64r:
+; X64: # %bb.0:
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: subq %rsi, %rax
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: retq
+;
+; X32-LABEL: sub_64r:
+; X32: # %bb.0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %ebx, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: lock cmpxchg8b (%esi)
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%esi), %eax
+; X32-NEXT: movl 4(%esi), %edx
+; X32-NEXT: .p2align 4, 0x90
+; X32-NEXT: .LBB23_1: # %atomicrmw.start
+; X32-NEXT: # =>This Inner Loop Header: Depth=1
+; X32-NEXT: lock cmpxchg8b (%esi)
+; X32-NEXT: jne .LBB23_1
+; X32-NEXT: # %bb.2: # %atomicrmw.end
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+; We do not check X86-32 as it cannot do 'subq'.
+ %1 = load atomic i64, i64* %p acquire, align 8
+ %2 = sub i64 %1, %v
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @sub_32r_seq_cst(i32* %p, i32 %v) {
+; X64-LABEL: sub_32r_seq_cst:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: subl %esi, %eax
+; X64-NEXT: xchgl %eax, (%rdi)
+; X64-NEXT: retq
+;
+; X32-LABEL: sub_32r_seq_cst:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: xchgl %ecx, (%eax)
+; X32-NEXT: retl
+ %1 = load atomic i32, i32* %p monotonic, align 4
+ %2 = sub i32 %1, %v
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
 ; ----- AND -----
 
 define void @and_8i(i8* %p) {
@@ -593,11 +775,11 @@ define void @and_64i(i64* %p) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB24_1: # %atomicrmw.start
+; X32-NEXT: .LBB31_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: xorl %ecx, %ecx
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB24_1
+; X32-NEXT: jne .LBB31_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
@@ -638,10 +820,10 @@ define void @and_64r(i64* %p, i64 %v) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB25_1: # %atomicrmw.start
+; X32-NEXT: .LBB32_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB25_1
+; X32-NEXT: jne .LBB32_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
@@ -830,10 +1012,10 @@ define void @or_64i(i64* %p) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB34_1: # %atomicrmw.start
+; X32-NEXT: .LBB41_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB34_1
+; X32-NEXT: jne .LBB41_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
@@ -874,10 +1056,10 @@ define void @or_64r(i64* %p, i64 %v) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB35_1: # %atomicrmw.start
+; X32-NEXT: .LBB42_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB35_1
+; X32-NEXT: jne .LBB42_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1066,10 +1248,10 @@ define void @xor_64i(i64* %p) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB44_1: # %atomicrmw.start
+; X32-NEXT: .LBB51_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB44_1
+; X32-NEXT: jne .LBB51_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1110,10 +1292,10 @@ define void @xor_64r(i64* %p, i64 %v) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB45_1: # %atomicrmw.start
+; X32-NEXT: .LBB52_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB45_1
+; X32-NEXT: jne .LBB52_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1266,10 +1448,10 @@ define void @inc_64(i64* %p) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB51_1: # %atomicrmw.start
+; X32-NEXT: .LBB58_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB51_1
+; X32-NEXT: jne .LBB58_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1413,10 +1595,10 @@ define void @dec_64(i64* %p) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB56_1: # %atomicrmw.start
+; X32-NEXT: .LBB63_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB56_1
+; X32-NEXT: jne .LBB63_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1545,10 +1727,10 @@ define void @not_64(i64* %p) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB61_1: # %atomicrmw.start
+; X32-NEXT: .LBB68_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB61_1
+; X32-NEXT: jne .LBB68_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 8
@@ -1672,11 +1854,11 @@ define void @neg_64(i64* %p) {
 ; X32-NEXT: movl (%edi), %eax
 ; X32-NEXT: movl 4(%edi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB66_1: # %atomicrmw.start
+; X32-NEXT: .LBB73_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: movl %esi, %ecx
 ; X32-NEXT: lock cmpxchg8b (%edi)
-; X32-NEXT: jne .LBB66_1
+; X32-NEXT: jne .LBB73_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: popl %esi
 ; X32-NEXT: .cfi_def_cfa_offset 12
@@ -1784,10 +1966,10 @@ define void @fadd_64r(double* %loc, double %val) {
 ; X32-NEXT: movl (%esi), %eax
 ; X32-NEXT: movl 4(%esi), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB69_1: # %atomicrmw.start
+; X32-NEXT: .LBB76_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esi)
-; X32-NEXT: jne .LBB69_1
+; X32-NEXT: jne .LBB76_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: leal -8(%ebp), %esp
 ; X32-NEXT: popl %esi
@@ -1874,10 +2056,10 @@ define void @fadd_64g() {
 ; X32-NEXT: movl glob64+4, %edx
 ; X32-NEXT: movl glob64, %eax
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB71_1: # %atomicrmw.start
+; X32-NEXT: .LBB78_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b glob64
-; X32-NEXT: jne .LBB71_1
+; X32-NEXT: jne .LBB78_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: leal -4(%ebp), %esp
 ; X32-NEXT: popl %ebx
@@ -1961,10 +2143,10 @@ define void @fadd_64imm() {
 ; X32-NEXT: movl -559038737, %eax
 ; X32-NEXT: movl -559038733, %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB73_1: # %atomicrmw.start
+; X32-NEXT: .LBB80_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b -559038737
-; X32-NEXT: jne .LBB73_1
+; X32-NEXT: jne .LBB80_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: leal -4(%ebp), %esp
 ; X32-NEXT: popl %ebx
@@ -2048,10 +2230,10 @@ define void @fadd_64stack() {
 ; X32-NEXT: movl (%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB75_1: # %atomicrmw.start
+; X32-NEXT: .LBB82_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%esp)
-; X32-NEXT: jne .LBB75_1
+; X32-NEXT: jne .LBB82_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: leal -4(%ebp), %esp
 ; X32-NEXT: popl %ebx
@@ -2108,10 +2290,10 @@ define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) {
 ; X32-NEXT: movl (%edi,%esi,8), %eax
 ; X32-NEXT: movl 4(%edi,%esi,8), %edx
 ; X32-NEXT: .p2align 4, 0x90
-; X32-NEXT: .LBB76_1: # %atomicrmw.start
+; X32-NEXT: .LBB83_1: # %atomicrmw.start
 ; X32-NEXT: # =>This Inner Loop Header: Depth=1
 ; X32-NEXT: lock cmpxchg8b (%edi,%esi,8)
-; X32-NEXT: jne .LBB76_1
+; X32-NEXT: jne .LBB83_1
 ; X32-NEXT: # %bb.2: # %atomicrmw.end
 ; X32-NEXT: leal -12(%ebp), %esp
 ; X32-NEXT: popl %esi
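For reference, a minimal sketch (not part of this diff; the function name @sub_32i_sketch is hypothetical) of the load/sub/store pattern the new tests above exercise, written in the same IR style as the test file. The ordering on the atomic store is what decides the lowering: as the sub_32r and sub_32r_seq_cst checks show, a monotonic or release store can be emitted as a plain mov, while a seq_cst store requires xchg.

; Minimal sketch, assuming the same RUN lines as atomic_mi.ll.
define void @sub_32i_sketch(i32* %p) {
  %1 = load atomic i32, i32* %p acquire, align 4  ; atomic load: must not be duplicated
  %2 = sub i32 %1, 16                             ; plain (non-atomic) subtraction
  store atomic i32 %2, i32* %p release, align 4   ; release store: a plain mov suffices on x86
  ret void
}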