-rw-r--r--  llvm/test/CodeGen/X86/atomic_mi.ll | 1532
1 file changed, 1228 insertions(+), 304 deletions(-)
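Context for the diff that follows: the patch replaces the old hand-written, partial CHECK patterns with complete assembly expectations autogenerated by utils/update_llc_test_checks.py (see the NOTE line added at the top of the file), and it makes the slow-incdec run reuse the X64 prefix, splitting out FAST_INC/SLOW_INC only where the two runs differ. As a minimal, self-contained sketch of the pattern nearly every test in this file exercises — the function name is illustrative and mirrors @add_32i from the file, and the RUN line keeps only the X64 prefix for brevity:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X64

; A non-seq_cst atomic load, a plain add, and a non-seq_cst atomic store
; should fold into a single memory-destination add, with no lock prefix and
; no separate mov.
define void @example_add_32i(i32* %p) {
; X64-LABEL: example_add_32i:
; X64:       # %bb.0:
; X64-NEXT:    addl $2, (%rdi)
; X64-NEXT:    retq
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

The seq_cst variants below still expect xchgl, since (as the file's own comment notes) replacing a lock xchgl by a plain movl would be incorrect.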
diff --git a/llvm/test/CodeGen/X86/atomic_mi.ll b/llvm/test/CodeGen/X86/atomic_mi.ll index e9f1b59ac58..5f07f54c482 100644 --- a/llvm/test/CodeGen/X86/atomic_mi.ll +++ b/llvm/test/CodeGen/X86/atomic_mi.ll @@ -1,6 +1,7 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X64 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X64 --check-prefix FAST_INC ; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix X64 --check-prefix SLOW_INC ; This file checks that atomic (non-seq_cst) stores of immediate values are ; done in one mov instruction and not 2. More precisely, it makes sure that the @@ -30,48 +31,83 @@ define void @store_atomic_imm_8(i8* %p) { ; X64-LABEL: store_atomic_imm_8: -; X64: movb -; X64-NOT: movb +; X64: # %bb.0: +; X64-NEXT: movb $42, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: store_atomic_imm_8: -; X32: movb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movb $42, (%eax) +; X32-NEXT: retl store atomic i8 42, i8* %p release, align 1 ret void } define void @store_atomic_imm_16(i16* %p) { ; X64-LABEL: store_atomic_imm_16: -; X64: movw -; X64-NOT: movw +; X64: # %bb.0: +; X64-NEXT: movw $42, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: store_atomic_imm_16: -; X32: movw -; X32-NOT: movw +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw $42, (%eax) +; X32-NEXT: retl store atomic i16 42, i16* %p monotonic, align 2 ret void } define void @store_atomic_imm_32(i32* %p) { ; X64-LABEL: store_atomic_imm_32: -; X64: movl -; X64-NOT: movl +; X64: # %bb.0: +; X64-NEXT: movl $42, (%rdi) +; X64-NEXT: retq +; +; X32-LABEL: store_atomic_imm_32: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl $42, (%eax) +; X32-NEXT: retl ; On 32 bits, there is an extra movl for each of those functions ; (probably for alignment reasons). -; X32-LABEL: store_atomic_imm_32: -; X32: movl 4(%esp), %eax -; X32: movl -; X32-NOT: movl store atomic i32 42, i32* %p release, align 4 ret void } define void @store_atomic_imm_64(i64* %p) { ; X64-LABEL: store_atomic_imm_64: -; X64: movq -; X64-NOT: movq +; X64: # %bb.0: +; X64-NEXT: movq $42, (%rdi) +; X64-NEXT: retq +; +; X32-LABEL: store_atomic_imm_64: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB3_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: movl $42, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB3_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl ; These are implemented with a CAS loop on 32 bit architectures, and thus ; cannot be optimized in the same way as the others. 
-; X32-LABEL: store_atomic_imm_64: -; X32: cmpxchg8b store atomic i64 42, i64* %p release, align 8 ret void } @@ -80,8 +116,35 @@ define void @store_atomic_imm_64(i64* %p) { ; even on X64, one must use movabsq that can only target a register. define void @store_atomic_imm_64_big(i64* %p) { ; X64-LABEL: store_atomic_imm_64_big: -; X64: movabsq -; X64: movq +; X64: # %bb.0: +; X64-NEXT: movabsq $100000000000, %rax # imm = 0x174876E800 +; X64-NEXT: movq %rax, (%rdi) +; X64-NEXT: retq +; +; X32-LABEL: store_atomic_imm_64_big: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB4_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: movl $23, %ecx +; X32-NEXT: movl $1215752192, %ebx # imm = 0x4876E800 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB4_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl store atomic i64 100000000000, i64* %p monotonic, align 8 ret void } @@ -89,9 +152,17 @@ define void @store_atomic_imm_64_big(i64* %p) { ; It would be incorrect to replace a lock xchgl by a movl define void @store_atomic_imm_32_seq_cst(i32* %p) { ; X64-LABEL: store_atomic_imm_32_seq_cst: -; X64: xchgl +; X64: # %bb.0: +; X64-NEXT: movl $42, %eax +; X64-NEXT: xchgl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: store_atomic_imm_32_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl $42, %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl store atomic i32 42, i32* %p seq_cst, align 4 ret void } @@ -100,13 +171,15 @@ define void @store_atomic_imm_32_seq_cst(i32* %p) { define void @add_8i(i8* %p) { ; X64-LABEL: add_8i: -; X64-NOT: lock -; X64: addb -; X64-NOT: movb +; X64: # %bb.0: +; X64-NEXT: addb $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_8i: -; X32-NOT: lock -; X32: addb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: addb $2, (%eax) +; X32-NEXT: retl %1 = load atomic i8, i8* %p seq_cst, align 1 %2 = add i8 %1, 2 store atomic i8 %2, i8* %p release, align 1 @@ -115,13 +188,16 @@ define void @add_8i(i8* %p) { define void @add_8r(i8* %p, i8 %v) { ; X64-LABEL: add_8r: -; X64-NOT: lock -; X64: addb -; X64-NOT: movb +; X64: # %bb.0: +; X64-NEXT: addb %sil, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_8r: -; X32-NOT: lock -; X32: addb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addb %al, (%ecx) +; X32-NEXT: retl %1 = load atomic i8, i8* %p seq_cst, align 1 %2 = add i8 %1, %v store atomic i8 %2, i8* %p release, align 1 @@ -132,9 +208,19 @@ define void @add_16i(i16* %p) { ; Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. 
; X64-LABEL: add_16i: -; X64-NOT: addw +; X64: # %bb.0: +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: addl $2, %eax +; X64-NEXT: movw %ax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_16i: -; X32-NOT: addw +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: addl $2, %ecx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl %1 = load atomic i16, i16* %p acquire, align 2 %2 = add i16 %1, 2 store atomic i16 %2, i16* %p release, align 2 @@ -145,9 +231,19 @@ define void @add_16r(i16* %p, i16 %v) { ; Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. ; X64-LABEL: add_16r: -; X64-NOT: addw +; X64: # %bb.0: +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: addl %esi, %eax +; X64-NEXT: movw %ax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_16r: -; X32-NOT: addw [.*], ( +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: addw {{[0-9]+}}(%esp), %cx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl %1 = load atomic i16, i16* %p acquire, align 2 %2 = add i16 %1, %v store atomic i16 %2, i16* %p release, align 2 @@ -156,13 +252,15 @@ define void @add_16r(i16* %p, i16 %v) { define void @add_32i(i32* %p) { ; X64-LABEL: add_32i: -; X64-NOT: lock -; X64: addl -; X64-NOT: movl +; X64: # %bb.0: +; X64-NEXT: addl $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_32i: -; X32-NOT: lock -; X32: addl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: addl $2, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = add i32 %1, 2 store atomic i32 %2, i32* %p monotonic, align 4 @@ -171,13 +269,16 @@ define void @add_32i(i32* %p) { define void @add_32r(i32* %p, i32 %v) { ; X64-LABEL: add_32r: -; X64-NOT: lock -; X64: addl -; X64-NOT: movl +; X64: # %bb.0: +; X64-NEXT: addl %esi, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_32r: -; X32-NOT: lock -; X32: addl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl %eax, (%ecx) +; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = add i32 %1, %v store atomic i32 %2, i32* %p monotonic, align 4 @@ -189,15 +290,19 @@ define void @add_32r(i32* %p, i32 %v) { ; applies to other sizes and operations. define void @add_32r_self(i32* %p) { ; X64-LABEL: add_32r_self: -; X64-NOT: lock -; X64: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]] -; X64: addl %[[R]], %[[R]] -; X64: movl %[[R]], (%[[M]]) +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: addl %eax, %eax +; X64-NEXT: movl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_32r_self: -; X32-NOT: lock -; X32: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]] -; X32: addl %[[R]], %[[R]] -; X32: movl %[[R]], (%[[M]]) +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: addl %ecx, %ecx +; X32-NEXT: movl %ecx, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = add i32 %1, %1 store atomic i32 %2, i32* %p monotonic, align 4 @@ -208,19 +313,21 @@ define void @add_32r_self(i32* %p) { ; optimizer isn't allowed to duplicate the load because it's atomic. 
define i32 @add_32r_ret_load(i32* %p, i32 %v) { ; X64-LABEL: add_32r_ret_load: -; X64-NOT: lock -; X64: movl (%rdi), %eax -; X64-NEXT: addl %eax, %esi -; X64-NEXT: movl %esi, (%rdi) -; X64-NEXT: retq +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: addl %eax, %esi +; X64-NEXT: movl %esi, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_32r_ret_load: -; X32-NOT: lock -; X32: movl 4(%esp), %[[P:[a-z]+]] -; X32-NEXT: movl (%[[P]]), -; X32-NOT: %[[P]] +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: addl %eax, %edx +; X32-NEXT: movl %edx, (%ecx) +; X32-NEXT: retl ; More code here, we just don't want it to load from P. -; X32: movl %{{.*}}, (%[[P]]) -; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = add i32 %1, %v store atomic i32 %2, i32* %p monotonic, align 4 @@ -229,11 +336,42 @@ define i32 @add_32r_ret_load(i32* %p, i32 %v) { define void @add_64i(i64* %p) { ; X64-LABEL: add_64i: -; X64-NOT: lock -; X64: addq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'addq'. +; X64: # %bb.0: +; X64-NEXT: addq $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_64i: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl $2, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB14_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB14_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; We do not check X86-32 as it cannot do 'addq'. %1 = load atomic i64, i64* %p acquire, align 8 %2 = add i64 %1, 2 store atomic i64 %2, i64* %p release, align 8 @@ -242,11 +380,42 @@ define void @add_64i(i64* %p) { define void @add_64r(i64* %p, i64 %v) { ; X64-LABEL: add_64r: -; X64-NOT: lock -; X64: addq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'addq'. 
+; X64: # %bb.0: +; X64-NEXT: addq %rsi, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_64r: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB15_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB15_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; We do not check X86-32 as it cannot do 'addq'. %1 = load atomic i64, i64* %p acquire, align 8 %2 = add i64 %1, %v store atomic i64 %2, i64* %p release, align 8 @@ -255,9 +424,19 @@ define void @add_64r(i64* %p, i64 %v) { define void @add_32i_seq_cst(i32* %p) { ; X64-LABEL: add_32i_seq_cst: -; X64: xchgl +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: addl $2, %eax +; X64-NEXT: xchgl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_32i_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: addl $2, %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = add i32 %1, 2 store atomic i32 %2, i32* %p seq_cst, align 4 @@ -266,9 +445,19 @@ define void @add_32i_seq_cst(i32* %p) { define void @add_32r_seq_cst(i32* %p, i32 %v) { ; X64-LABEL: add_32r_seq_cst: -; X64: xchgl +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: addl %esi, %eax +; X64-NEXT: xchgl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: add_32r_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = add i32 %1, %v store atomic i32 %2, i32* %p seq_cst, align 4 @@ -279,13 +468,15 @@ define void @add_32r_seq_cst(i32* %p, i32 %v) { define void @and_8i(i8* %p) { ; X64-LABEL: and_8i: -; X64-NOT: lock -; X64: andb -; X64-NOT: movb +; X64: # %bb.0: +; X64-NEXT: andb $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_8i: -; X32-NOT: lock -; X32: andb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andb $2, (%eax) +; X32-NEXT: retl %1 = load atomic i8, i8* %p monotonic, align 1 %2 = and i8 %1, 2 store atomic i8 %2, i8* %p release, align 1 @@ -294,13 +485,16 @@ define void @and_8i(i8* %p) { define void @and_8r(i8* %p, i8 %v) { ; X64-LABEL: and_8r: -; X64-NOT: lock -; X64: andb -; X64-NOT: movb +; X64: # %bb.0: +; X64-NEXT: andb %sil, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_8r: -; X32-NOT: lock -; X32: andb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: andb %al, (%ecx) +; X32-NEXT: retl %1 = load atomic i8, i8* %p monotonic, align 1 %2 = and i8 %1, %v store atomic i8 %2, i8* %p release, align 1 @@ -311,9 +505,19 @@ define void @and_16i(i16* %p) { ; 
Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. ; X64-LABEL: and_16i: -; X64-NOT: andw +; X64: # %bb.0: +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: andl $2, %eax +; X64-NEXT: movw %ax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_16i: -; X32-NOT: andw +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: andl $2, %ecx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl %1 = load atomic i16, i16* %p acquire, align 2 %2 = and i16 %1, 2 store atomic i16 %2, i16* %p release, align 2 @@ -324,9 +528,19 @@ define void @and_16r(i16* %p, i16 %v) { ; Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. ; X64-LABEL: and_16r: -; X64-NOT: andw +; X64: # %bb.0: +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: andl %esi, %eax +; X64-NEXT: movw %ax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_16r: -; X32-NOT: andw [.*], ( +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: andw {{[0-9]+}}(%esp), %cx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl %1 = load atomic i16, i16* %p acquire, align 2 %2 = and i16 %1, %v store atomic i16 %2, i16* %p release, align 2 @@ -335,13 +549,15 @@ define void @and_16r(i16* %p, i16 %v) { define void @and_32i(i32* %p) { ; X64-LABEL: and_32i: -; X64-NOT: lock -; X64: andl -; X64-NOT: movl +; X64: # %bb.0: +; X64-NEXT: andl $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_32i: -; X32-NOT: lock -; X32: andl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andl $2, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = and i32 %1, 2 store atomic i32 %2, i32* %p release, align 4 @@ -350,13 +566,16 @@ define void @and_32i(i32* %p) { define void @and_32r(i32* %p, i32 %v) { ; X64-LABEL: and_32r: -; X64-NOT: lock -; X64: andl -; X64-NOT: movl +; X64: # %bb.0: +; X64-NEXT: andl %esi, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_32r: -; X32-NOT: lock -; X32: andl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: andl %eax, (%ecx) +; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = and i32 %1, %v store atomic i32 %2, i32* %p release, align 4 @@ -365,11 +584,41 @@ define void @and_32r(i32* %p, i32 %v) { define void @and_64i(i64* %p) { ; X64-LABEL: and_64i: -; X64-NOT: lock -; X64: andq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'andq'. 
+; X64: # %bb.0: +; X64-NEXT: andq $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_64i: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: andl $2, %ebx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB24_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB24_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; We do not check X86-32 as it cannot do 'andq'. %1 = load atomic i64, i64* %p acquire, align 8 %2 = and i64 %1, 2 store atomic i64 %2, i64* %p release, align 8 @@ -378,11 +627,42 @@ define void @and_64i(i64* %p) { define void @and_64r(i64* %p, i64 %v) { ; X64-LABEL: and_64r: -; X64-NOT: lock -; X64: andq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'andq'. +; X64: # %bb.0: +; X64-NEXT: andq %rsi, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_64r: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: andl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB25_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB25_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; We do not check X86-32 as it cannot do 'andq'. 
%1 = load atomic i64, i64* %p acquire, align 8 %2 = and i64 %1, %v store atomic i64 %2, i64* %p release, align 8 @@ -391,9 +671,19 @@ define void @and_64r(i64* %p, i64 %v) { define void @and_32i_seq_cst(i32* %p) { ; X64-LABEL: and_32i_seq_cst: -; X64: xchgl +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: andl $2, %eax +; X64-NEXT: xchgl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_32i_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: andl $2, %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = and i32 %1, 2 store atomic i32 %2, i32* %p seq_cst, align 4 @@ -402,9 +692,19 @@ define void @and_32i_seq_cst(i32* %p) { define void @and_32r_seq_cst(i32* %p, i32 %v) { ; X64-LABEL: and_32r_seq_cst: -; X64: xchgl +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: andl %esi, %eax +; X64-NEXT: xchgl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: and_32r_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = and i32 %1, %v store atomic i32 %2, i32* %p seq_cst, align 4 @@ -415,13 +715,15 @@ define void @and_32r_seq_cst(i32* %p, i32 %v) { define void @or_8i(i8* %p) { ; X64-LABEL: or_8i: -; X64-NOT: lock -; X64: orb -; X64-NOT: movb +; X64: # %bb.0: +; X64-NEXT: orb $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_8i: -; X32-NOT: lock -; X32: orb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: orb $2, (%eax) +; X32-NEXT: retl %1 = load atomic i8, i8* %p acquire, align 1 %2 = or i8 %1, 2 store atomic i8 %2, i8* %p release, align 1 @@ -430,13 +732,16 @@ define void @or_8i(i8* %p) { define void @or_8r(i8* %p, i8 %v) { ; X64-LABEL: or_8r: -; X64-NOT: lock -; X64: orb -; X64-NOT: movb +; X64: # %bb.0: +; X64-NEXT: orb %sil, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_8r: -; X32-NOT: lock -; X32: orb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: orb %al, (%ecx) +; X32-NEXT: retl %1 = load atomic i8, i8* %p acquire, align 1 %2 = or i8 %1, %v store atomic i8 %2, i8* %p release, align 1 @@ -445,9 +750,19 @@ define void @or_8r(i8* %p, i8 %v) { define void @or_16i(i16* %p) { ; X64-LABEL: or_16i: -; X64-NOT: orw +; X64: # %bb.0: +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: orl $2, %eax +; X64-NEXT: movw %ax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_16i: -; X32-NOT: orw +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: orl $2, %ecx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl %1 = load atomic i16, i16* %p acquire, align 2 %2 = or i16 %1, 2 store atomic i16 %2, i16* %p release, align 2 @@ -456,9 +771,19 @@ define void @or_16i(i16* %p) { define void @or_16r(i16* %p, i16 %v) { ; X64-LABEL: or_16r: -; X64-NOT: orw +; X64: # %bb.0: +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: orl %esi, %eax +; X64-NEXT: movw %ax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_16r: -; X32-NOT: orw [.*], ( +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: orw {{[0-9]+}}(%esp), %cx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl %1 = load atomic i16, i16* %p acquire, align 2 %2 = or i16 %1, %v store atomic i16 %2, i16* %p release, align 2 @@ -467,13 +792,15 @@ define 
void @or_16r(i16* %p, i16 %v) { define void @or_32i(i32* %p) { ; X64-LABEL: or_32i: -; X64-NOT: lock -; X64: orl -; X64-NOT: movl +; X64: # %bb.0: +; X64-NEXT: orl $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_32i: -; X32-NOT: lock -; X32: orl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: orl $2, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = or i32 %1, 2 store atomic i32 %2, i32* %p release, align 4 @@ -482,13 +809,16 @@ define void @or_32i(i32* %p) { define void @or_32r(i32* %p, i32 %v) { ; X64-LABEL: or_32r: -; X64-NOT: lock -; X64: orl -; X64-NOT: movl +; X64: # %bb.0: +; X64-NEXT: orl %esi, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_32r: -; X32-NOT: lock -; X32: orl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: orl %eax, (%ecx) +; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = or i32 %1, %v store atomic i32 %2, i32* %p release, align 4 @@ -497,11 +827,41 @@ define void @or_32r(i32* %p, i32 %v) { define void @or_64i(i64* %p) { ; X64-LABEL: or_64i: -; X64-NOT: lock -; X64: orq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'orq'. +; X64: # %bb.0: +; X64-NEXT: orq $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_64i: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: orl $2, %ebx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB34_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB34_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; We do not check X86-32 as it cannot do 'orq'. %1 = load atomic i64, i64* %p acquire, align 8 %2 = or i64 %1, 2 store atomic i64 %2, i64* %p release, align 8 @@ -510,11 +870,42 @@ define void @or_64i(i64* %p) { define void @or_64r(i64* %p, i64 %v) { ; X64-LABEL: or_64r: -; X64-NOT: lock -; X64: orq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'orq'. 
+; X64: # %bb.0: +; X64-NEXT: orq %rsi, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_64r: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: orl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB35_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB35_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; We do not check X86-32 as it cannot do 'orq'. %1 = load atomic i64, i64* %p acquire, align 8 %2 = or i64 %1, %v store atomic i64 %2, i64* %p release, align 8 @@ -523,9 +914,19 @@ define void @or_64r(i64* %p, i64 %v) { define void @or_32i_seq_cst(i32* %p) { ; X64-LABEL: or_32i_seq_cst: -; X64: xchgl +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: orl $2, %eax +; X64-NEXT: xchgl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_32i_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: orl $2, %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = or i32 %1, 2 store atomic i32 %2, i32* %p seq_cst, align 4 @@ -534,9 +935,19 @@ define void @or_32i_seq_cst(i32* %p) { define void @or_32r_seq_cst(i32* %p, i32 %v) { ; X64-LABEL: or_32r_seq_cst: -; X64: xchgl +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: orl %esi, %eax +; X64-NEXT: xchgl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: or_32r_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = or i32 %1, %v store atomic i32 %2, i32* %p seq_cst, align 4 @@ -547,13 +958,15 @@ define void @or_32r_seq_cst(i32* %p, i32 %v) { define void @xor_8i(i8* %p) { ; X64-LABEL: xor_8i: -; X64-NOT: lock -; X64: xorb -; X64-NOT: movb +; X64: # %bb.0: +; X64-NEXT: xorb $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_8i: -; X32-NOT: lock -; X32: xorb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorb $2, (%eax) +; X32-NEXT: retl %1 = load atomic i8, i8* %p acquire, align 1 %2 = xor i8 %1, 2 store atomic i8 %2, i8* %p release, align 1 @@ -562,13 +975,16 @@ define void @xor_8i(i8* %p) { define void @xor_8r(i8* %p, i8 %v) { ; X64-LABEL: xor_8r: -; X64-NOT: lock -; X64: xorb -; X64-NOT: movb +; X64: # %bb.0: +; X64-NEXT: xorb %sil, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_8r: -; X32-NOT: lock -; X32: xorb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xorb %al, (%ecx) +; X32-NEXT: retl %1 = load atomic i8, i8* %p acquire, align 1 %2 = xor i8 %1, %v store atomic i8 %2, i8* %p release, align 1 @@ -577,9 +993,19 @@ define void @xor_8r(i8* %p, i8 %v) { define void @xor_16i(i16* 
%p) { ; X64-LABEL: xor_16i: -; X64-NOT: xorw +; X64: # %bb.0: +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: xorl $2, %eax +; X64-NEXT: movw %ax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_16i: -; X32-NOT: xorw +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: xorl $2, %ecx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl %1 = load atomic i16, i16* %p acquire, align 2 %2 = xor i16 %1, 2 store atomic i16 %2, i16* %p release, align 2 @@ -588,9 +1014,19 @@ define void @xor_16i(i16* %p) { define void @xor_16r(i16* %p, i16 %v) { ; X64-LABEL: xor_16r: -; X64-NOT: xorw +; X64: # %bb.0: +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: xorl %esi, %eax +; X64-NEXT: movw %ax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_16r: -; X32-NOT: xorw [.*], ( +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: xorw {{[0-9]+}}(%esp), %cx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl %1 = load atomic i16, i16* %p acquire, align 2 %2 = xor i16 %1, %v store atomic i16 %2, i16* %p release, align 2 @@ -599,13 +1035,15 @@ define void @xor_16r(i16* %p, i16 %v) { define void @xor_32i(i32* %p) { ; X64-LABEL: xor_32i: -; X64-NOT: lock -; X64: xorl -; X64-NOT: movl +; X64: # %bb.0: +; X64-NEXT: xorl $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_32i: -; X32-NOT: lock -; X32: xorl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl $2, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = xor i32 %1, 2 store atomic i32 %2, i32* %p release, align 4 @@ -614,13 +1052,16 @@ define void @xor_32i(i32* %p) { define void @xor_32r(i32* %p, i32 %v) { ; X64-LABEL: xor_32r: -; X64-NOT: lock -; X64: xorl -; X64-NOT: movl +; X64: # %bb.0: +; X64-NEXT: xorl %esi, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_32r: -; X32-NOT: lock -; X32: xorl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xorl %eax, (%ecx) +; X32-NEXT: retl %1 = load atomic i32, i32* %p acquire, align 4 %2 = xor i32 %1, %v store atomic i32 %2, i32* %p release, align 4 @@ -629,11 +1070,41 @@ define void @xor_32r(i32* %p, i32 %v) { define void @xor_64i(i64* %p) { ; X64-LABEL: xor_64i: -; X64-NOT: lock -; X64: xorq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'xorq'. +; X64: # %bb.0: +; X64-NEXT: xorq $2, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_64i: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: xorl $2, %ebx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB44_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB44_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; We do not check X86-32 as it cannot do 'xorq'. 
%1 = load atomic i64, i64* %p acquire, align 8 %2 = xor i64 %1, 2 store atomic i64 %2, i64* %p release, align 8 @@ -642,11 +1113,42 @@ define void @xor_64i(i64* %p) { define void @xor_64r(i64* %p, i64 %v) { ; X64-LABEL: xor_64r: -; X64-NOT: lock -; X64: xorq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'xorq'. +; X64: # %bb.0: +; X64-NEXT: xorq %rsi, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_64r: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB45_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB45_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; We do not check X86-32 as it cannot do 'xorq'. %1 = load atomic i64, i64* %p acquire, align 8 %2 = xor i64 %1, %v store atomic i64 %2, i64* %p release, align 8 @@ -655,9 +1157,19 @@ define void @xor_64r(i64* %p, i64 %v) { define void @xor_32i_seq_cst(i32* %p) { ; X64-LABEL: xor_32i_seq_cst: -; X64: xchgl +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: xorl $2, %eax +; X64-NEXT: xchgl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_32i_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: xorl $2, %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = xor i32 %1, 2 store atomic i32 %2, i32* %p seq_cst, align 4 @@ -666,9 +1178,19 @@ define void @xor_32i_seq_cst(i32* %p) { define void @xor_32r_seq_cst(i32* %p, i32 %v) { ; X64-LABEL: xor_32r_seq_cst: -; X64: xchgl +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: xorl %esi, %eax +; X64-NEXT: xchgl %eax, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: xor_32r_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl %1 = load atomic i32, i32* %p monotonic, align 4 %2 = xor i32 %1, %v store atomic i32 %2, i32* %p seq_cst, align 4 @@ -678,17 +1200,21 @@ define void @xor_32r_seq_cst(i32* %p, i32 %v) { ; ----- INC ----- define void @inc_8(i8* %p) { -; X64-LABEL: inc_8: -; X64-NOT: lock -; X64: incb -; X64-NOT: movb +; FAST_INC-LABEL: inc_8: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: incb (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: inc_8: -; X32-NOT: lock -; X32: incb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: incb (%eax) +; X32-NEXT: retl +; ; SLOW_INC-LABEL: inc_8: -; SLOW_INC-NOT: incb -; SLOW_INC-NOT: movb +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: addb $1, (%rdi) +; SLOW_INC-NEXT: retq %1 = load atomic i8, i8* %p seq_cst, align 1 %2 = add i8 %1, 1 store atomic i8 %2, i8* %p release, align 1 @@ -698,12 +1224,27 @@ define void @inc_8(i8* %p) { define void @inc_16(i16* %p) { 
; Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. -; X64-LABEL: inc_16: -; X64-NOT: incw +; FAST_INC-LABEL: inc_16: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: movw (%rdi), %ax +; FAST_INC-NEXT: incl %eax +; FAST_INC-NEXT: movw %ax, (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: inc_16: -; X32-NOT: incw +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: incl %ecx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl +; ; SLOW_INC-LABEL: inc_16: -; SLOW_INC-NOT: incw +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: movw (%rdi), %ax +; SLOW_INC-NEXT: addl $1, %eax +; SLOW_INC-NEXT: movw %ax, (%rdi) +; SLOW_INC-NEXT: retq %1 = load atomic i16, i16* %p acquire, align 2 %2 = add i16 %1, 1 store atomic i16 %2, i16* %p release, align 2 @@ -711,17 +1252,21 @@ define void @inc_16(i16* %p) { } define void @inc_32(i32* %p) { -; X64-LABEL: inc_32: -; X64-NOT: lock -; X64: incl -; X64-NOT: movl +; FAST_INC-LABEL: inc_32: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: incl (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: inc_32: -; X32-NOT: lock -; X32: incl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: incl (%eax) +; X32-NEXT: retl +; ; SLOW_INC-LABEL: inc_32: -; SLOW_INC-NOT: incl -; SLOW_INC-NOT: movl +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: addl $1, (%rdi) +; SLOW_INC-NEXT: retq %1 = load atomic i32, i32* %p acquire, align 4 %2 = add i32 %1, 1 store atomic i32 %2, i32* %p monotonic, align 4 @@ -729,15 +1274,48 @@ define void @inc_32(i32* %p) { } define void @inc_64(i64* %p) { -; X64-LABEL: inc_64: -; X64-NOT: lock -; X64: incq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'incq'. +; FAST_INC-LABEL: inc_64: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: incq (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: inc_64: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl $1, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB51_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB51_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; ; SLOW_INC-LABEL: inc_64: -; SLOW_INC-NOT: incq -; SLOW_INC-NOT: movq +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: addq $1, (%rdi) +; SLOW_INC-NEXT: retq +; We do not check X86-32 as it cannot do 'incq'. 
%1 = load atomic i64, i64* %p acquire, align 8 %2 = add i64 %1, 1 store atomic i64 %2, i64* %p release, align 8 @@ -745,10 +1323,27 @@ define void @inc_64(i64* %p) { } define void @inc_32_seq_cst(i32* %p) { -; X64-LABEL: inc_32_seq_cst: -; X64: xchgl +; FAST_INC-LABEL: inc_32_seq_cst: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: movl (%rdi), %eax +; FAST_INC-NEXT: incl %eax +; FAST_INC-NEXT: xchgl %eax, (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: inc_32_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: incl %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl +; +; SLOW_INC-LABEL: inc_32_seq_cst: +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: movl (%rdi), %eax +; SLOW_INC-NEXT: addl $1, %eax +; SLOW_INC-NEXT: xchgl %eax, (%rdi) +; SLOW_INC-NEXT: retq %1 = load atomic i32, i32* %p monotonic, align 4 %2 = add i32 %1, 1 store atomic i32 %2, i32* %p seq_cst, align 4 @@ -758,17 +1353,21 @@ define void @inc_32_seq_cst(i32* %p) { ; ----- DEC ----- define void @dec_8(i8* %p) { -; X64-LABEL: dec_8: -; X64-NOT: lock -; X64: decb -; X64-NOT: movb +; FAST_INC-LABEL: dec_8: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: decb (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: dec_8: -; X32-NOT: lock -; X32: decb -; X32-NOT: movb +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: decb (%eax) +; X32-NEXT: retl +; ; SLOW_INC-LABEL: dec_8: -; SLOW_INC-NOT: decb -; SLOW_INC-NOT: movb +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: addb $-1, (%rdi) +; SLOW_INC-NEXT: retq %1 = load atomic i8, i8* %p seq_cst, align 1 %2 = sub i8 %1, 1 store atomic i8 %2, i8* %p release, align 1 @@ -778,12 +1377,27 @@ define void @dec_8(i8* %p) { define void @dec_16(i16* %p) { ; Currently the transformation is not done on 16 bit accesses, as the backend ; treat 16 bit arithmetic as expensive on X86/X86_64. -; X64-LABEL: dec_16: -; X64-NOT: decw +; FAST_INC-LABEL: dec_16: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: movw (%rdi), %ax +; FAST_INC-NEXT: decl %eax +; FAST_INC-NEXT: movw %ax, (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: dec_16: -; X32-NOT: decw +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movw (%eax), %cx +; X32-NEXT: decl %ecx +; X32-NEXT: movw %cx, (%eax) +; X32-NEXT: retl +; ; SLOW_INC-LABEL: dec_16: -; SLOW_INC-NOT: decw +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: movw (%rdi), %ax +; SLOW_INC-NEXT: addl $-1, %eax +; SLOW_INC-NEXT: movw %ax, (%rdi) +; SLOW_INC-NEXT: retq %1 = load atomic i16, i16* %p acquire, align 2 %2 = sub i16 %1, 1 store atomic i16 %2, i16* %p release, align 2 @@ -791,17 +1405,21 @@ define void @dec_16(i16* %p) { } define void @dec_32(i32* %p) { -; X64-LABEL: dec_32: -; X64-NOT: lock -; X64: decl -; X64-NOT: movl +; FAST_INC-LABEL: dec_32: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: decl (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: dec_32: -; X32-NOT: lock -; X32: decl -; X32-NOT: movl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: decl (%eax) +; X32-NEXT: retl +; ; SLOW_INC-LABEL: dec_32: -; SLOW_INC-NOT: decl -; SLOW_INC-NOT: movl +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: addl $-1, (%rdi) +; SLOW_INC-NEXT: retq %1 = load atomic i32, i32* %p acquire, align 4 %2 = sub i32 %1, 1 store atomic i32 %2, i32* %p monotonic, align 4 @@ -809,15 +1427,48 @@ define void @dec_32(i32* %p) { } define void @dec_64(i64* %p) { -; X64-LABEL: dec_64: -; X64-NOT: lock -; X64: decq -; X64-NOT: movq -; We do not check X86-32 as it cannot do 'decq'. 
+; FAST_INC-LABEL: dec_64: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: decq (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: dec_64: +; X32: # %bb.0: +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl $-1, %ebx +; X32-NEXT: adcl $-1, %ecx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB56_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB56_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; ; SLOW_INC-LABEL: dec_64: -; SLOW_INC-NOT: decq -; SLOW_INC-NOT: movq +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: addq $-1, (%rdi) +; SLOW_INC-NEXT: retq +; We do not check X86-32 as it cannot do 'decq'. %1 = load atomic i64, i64* %p acquire, align 8 %2 = sub i64 %1, 1 store atomic i64 %2, i64* %p release, align 8 @@ -825,10 +1476,27 @@ define void @dec_64(i64* %p) { } define void @dec_32_seq_cst(i32* %p) { -; X64-LABEL: dec_32_seq_cst: -; X64: xchgl +; FAST_INC-LABEL: dec_32_seq_cst: +; FAST_INC: # %bb.0: +; FAST_INC-NEXT: movl (%rdi), %eax +; FAST_INC-NEXT: decl %eax +; FAST_INC-NEXT: xchgl %eax, (%rdi) +; FAST_INC-NEXT: retq +; ; X32-LABEL: dec_32_seq_cst: -; X32: xchgl +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: decl %ecx +; X32-NEXT: xchgl %ecx, (%eax) +; X32-NEXT: retl +; +; SLOW_INC-LABEL: dec_32_seq_cst: +; SLOW_INC: # %bb.0: +; SLOW_INC-NEXT: movl (%rdi), %eax +; SLOW_INC-NEXT: addl $-1, %eax +; SLOW_INC-NEXT: xchgl %eax, (%rdi) +; SLOW_INC-NEXT: retq %1 = load atomic i32, i32* %p monotonic, align 4 %2 = sub i32 %1, 1 store atomic i32 %2, i32* %p seq_cst, align 4 @@ -839,11 +1507,26 @@ define void @dec_32_seq_cst(i32* %p) { define void @fadd_32r(float* %loc, float %val) { ; X64-LABEL: fadd_32r: -; X64-NOT: lock -; X64-NOT: mov -; X64: addss (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]] -; X64-NEXT: movss %[[XMM]], (%[[M]]) +; X64: # %bb.0: +; X64-NEXT: addss (%rdi), %xmm0 +; X64-NEXT: movss %xmm0, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: fadd_32r: +; X32: # %bb.0: +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: fadds {{[0-9]+}}(%esp) +; X32-NEXT: fstps {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, (%eax) +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl ; Don't check x86-32. ; LLVM's SSE handling is conservative on x86-32 even without using atomics. 
%floc = bitcast float* %loc to i32* @@ -857,11 +1540,51 @@ define void @fadd_32r(float* %loc, float %val) { define void @fadd_64r(double* %loc, double %val) { ; X64-LABEL: fadd_64r: -; X64-NOT: lock -; X64-NOT: mov -; X64: addsd (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]] -; X64-NEXT: movsd %[[XMM]], (%[[M]]) +; X64: # %bb.0: +; X64-NEXT: addsd (%rdi), %xmm0 +; X64-NEXT: movsd %xmm0, (%rdi) +; X64-NEXT: retq +; ; X32-LABEL: fadd_64r: +; X32: # %bb.0: +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %esi +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $16, %esp +; X32-NEXT: .cfi_offset %esi, -16 +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: movl 8(%ebp), %esi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NEXT: faddl 12(%ebp) +; X32-NEXT: fstpl (%esp) +; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%esi), %eax +; X32-NEXT: movl 4(%esi), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB59_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esi) +; X32-NEXT: jne .LBB59_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: leal -8(%ebp), %esp +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 +; X32-NEXT: retl ; Don't check x86-32 (see comment above). %floc = bitcast double* %loc to i64* %1 = load atomic i64, i64* %floc seq_cst, align 8 @@ -878,11 +1601,26 @@ define void @fadd_64r(double* %loc, double %val) { ; Floating-point add to a global using an immediate. define void @fadd_32g() { ; X64-LABEL: fadd_32g: -; X64-NOT: lock -; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]] -; X64-NEXT: addss glob32(%rip), %[[XMM]] -; X64-NEXT: movss %[[XMM]], glob32(%rip) +; X64: # %bb.0: +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: addss {{.*}}(%rip), %xmm0 +; X64-NEXT: movss %xmm0, {{.*}}(%rip) +; X64-NEXT: retq +; ; X32-LABEL: fadd_32g: +; X32: # %bb.0: +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: movl glob32, %eax +; X32-NEXT: movl %eax, (%esp) +; X32-NEXT: fld1 +; X32-NEXT: fadds (%esp) +; X32-NEXT: fstps {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, glob32 +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl ; Don't check x86-32 (see comment above). 
%i = load atomic i32, i32* bitcast (float* @glob32 to i32*) monotonic, align 4 %f = bitcast i32 %i to float @@ -894,11 +1632,48 @@ define void @fadd_32g() { define void @fadd_64g() { ; X64-LABEL: fadd_64g: -; X64-NOT: lock -; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]] -; X64-NEXT: addsd glob64(%rip), %[[XMM]] -; X64-NEXT: movsd %[[XMM]], glob64(%rip) +; X64: # %bb.0: +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: addsd {{.*}}(%rip), %xmm0 +; X64-NEXT: movsd %xmm0, {{.*}}(%rip) +; X64-NEXT: retq +; ; X32-LABEL: fadd_64g: +; X32: # %bb.0: +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $24, %esp +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b glob64 +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: fld1 +; X32-NEXT: faddl {{[0-9]+}}(%esp) +; X32-NEXT: fstpl (%esp) +; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl glob64+4, %edx +; X32-NEXT: movl glob64, %eax +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB61_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b glob64 +; X32-NEXT: jne .LBB61_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 +; X32-NEXT: retl ; Don't check x86-32 (see comment above). %i = load atomic i64, i64* bitcast (double* @glob64 to i64*) monotonic, align 8 %f = bitcast i64 %i to double @@ -911,12 +1686,27 @@ define void @fadd_64g() { ; Floating-point add to a hard-coded immediate location using an immediate. define void @fadd_32imm() { ; X64-LABEL: fadd_32imm: -; X64-NOT: lock -; X64: movl $3735928559, %e[[M:[a-z]+]] -; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]] -; X64-NEXT: addss (%r[[M]]), %[[XMM]] -; X64-NEXT: movss %[[XMM]], (%r[[M]]) +; X64: # %bb.0: +; X64-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: addss (%rax), %xmm0 +; X64-NEXT: movss %xmm0, (%rax) +; X64-NEXT: retq +; ; X32-LABEL: fadd_32imm: +; X32: # %bb.0: +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: movl -559038737, %eax +; X32-NEXT: movl %eax, (%esp) +; X32-NEXT: fld1 +; X32-NEXT: fadds (%esp) +; X32-NEXT: fstps {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, -559038737 +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl ; Don't check x86-32 (see comment above). 
%i = load atomic i32, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4 %f = bitcast i32 %i to float @@ -928,12 +1718,49 @@ define void @fadd_32imm() { define void @fadd_64imm() { ; X64-LABEL: fadd_64imm: -; X64-NOT: lock -; X64: movl $3735928559, %e[[M:[a-z]+]] -; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]] -; X64-NEXT: addsd (%r[[M]]), %[[XMM]] -; X64-NEXT: movsd %[[XMM]], (%r[[M]]) +; X64: # %bb.0: +; X64-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: addsd (%rax), %xmm0 +; X64-NEXT: movsd %xmm0, (%rax) +; X64-NEXT: retq +; ; X32-LABEL: fadd_64imm: +; X32: # %bb.0: +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $24, %esp +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b -559038737 +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: fld1 +; X32-NEXT: faddl {{[0-9]+}}(%esp) +; X32-NEXT: fstpl (%esp) +; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl -559038737, %eax +; X32-NEXT: movl -559038733, %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB63_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b -559038737 +; X32-NEXT: jne .LBB63_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 +; X32-NEXT: retl ; Don't check x86-32 (see comment above). %i = load atomic i64, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8 %f = bitcast i64 %i to double @@ -946,11 +1773,26 @@ define void @fadd_64imm() { ; Floating-point add to a stack location. define void @fadd_32stack() { ; X64-LABEL: fadd_32stack: -; X64-NOT: lock -; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]] -; X64-NEXT: addss [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]] -; X64-NEXT: movss %[[XMM]], [[STACKOFF]](%rsp) +; X64: # %bb.0: +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: addss -{{[0-9]+}}(%rsp), %xmm0 +; X64-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: retq +; ; X32-LABEL: fadd_32stack: +; X32: # %bb.0: +; X32-NEXT: subl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, (%esp) +; X32-NEXT: fld1 +; X32-NEXT: fadds (%esp) +; X32-NEXT: fstps {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl ; Don't check x86-32 (see comment above). 
%ptr = alloca i32, align 4 %bc3 = bitcast i32* %ptr to float* @@ -964,11 +1806,48 @@ define void @fadd_32stack() { define void @fadd_64stack() { ; X64-LABEL: fadd_64stack: -; X64-NOT: lock -; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]] -; X64-NEXT: addsd [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]] -; X64-NEXT: movsd %[[XMM]], [[STACKOFF]](%rsp) +; X64: # %bb.0: +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: addsd -{{[0-9]+}}(%rsp), %xmm0 +; X64-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: retq +; ; X32-LABEL: fadd_64stack: +; X32: # %bb.0: +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $32, %esp +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: fld1 +; X32-NEXT: faddl {{[0-9]+}}(%esp) +; X32-NEXT: fstpl {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB65_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%esp) +; X32-NEXT: jne .LBB65_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 +; X32-NEXT: retl ; Don't check x86-32 (see comment above). %ptr = alloca i64, align 8 %bc3 = bitcast i64* %ptr to double* @@ -982,10 +1861,55 @@ define void @fadd_64stack() { define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) { ; X64-LABEL: fadd_array: -; X64-NOT: lock -; X64: addsd ([[ADDR:%r..,%r..,8]]), %[[XMM:xmm[0-9]+]] -; X64-NEXT: movsd %[[XMM]], ([[ADDR]]) +; X64: # %bb.0: # %bb +; X64-NEXT: addsd (%rdi,%rsi,8), %xmm0 +; X64-NEXT: movsd %xmm0, (%rdi,%rsi,8) +; X64-NEXT: retq +; ; X32-LABEL: fadd_array: +; X32: # %bb.0: # %bb +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $24, %esp +; X32-NEXT: .cfi_offset %esi, -20 +; X32-NEXT: .cfi_offset %edi, -16 +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: movl 20(%ebp), %esi +; X32-NEXT: movl 8(%ebp), %edi +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: lock cmpxchg8b (%edi,%esi,8) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NEXT: faddl 12(%ebp) +; X32-NEXT: fstpl (%esp) +; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%edi,%esi,8), %eax +; X32-NEXT: movl 4(%edi,%esi,8), %edx +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB66_1: # %atomicrmw.start +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: lock cmpxchg8b (%edi,%esi,8) +; X32-NEXT: jne .LBB66_1 +; X32-NEXT: # %bb.2: # %atomicrmw.end +; X32-NEXT: leal -12(%ebp), %esp +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 +; X32-NEXT: retl ; 
Don't check x86-32 (see comment above). bb: %tmp4 = getelementptr inbounds i64, i64* %arg, i64 %arg2
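The remaining hunks of the patch cover the floating-point read-modify-write tests (fadd_32r through fadd_array), which express the FP operation through integer atomic loads/stores plus bitcasts. A sketch of their IR shape, assuming the same typed-pointer syntax as the rest of the file; the function name and the seq_cst/release orderings are illustrative, and the X64 checks mirror the ones the patch adds for @fadd_32r:

define void @example_fadd_32r(float* %loc, float %val) {
; X64-LABEL: example_fadd_32r:
; X64:       # %bb.0:
; X64-NEXT:    addss (%rdi), %xmm0
; X64-NEXT:    movss %xmm0, (%rdi)
; X64-NEXT:    retq
  %floc = bitcast float* %loc to i32*               ; view the float slot as an i32 slot
  %1 = load atomic i32, i32* %floc seq_cst, align 4
  %2 = bitcast i32 %1 to float
  %add = fadd float %2, %val
  %3 = bitcast float %add to i32
  store atomic i32 %3, i32* %floc release, align 4
  ret void
}

On x86-64 this lowers to addss/movss with no lock prefix; the 64-bit cases on x86-32 instead go through the lock cmpxchg8b loops shown in the X32 checks.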