diff options
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/test/CodeGen/X86/MergeConsecutiveStores.ll | 431 |
1 files changed, 329 insertions, 102 deletions
diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll index 69f5f4c7a05..5058f1f5ec9 100644 --- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -1,14 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s %struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 } %struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 } -; CHECK-LABEL: merge_const_store: ; save 1,2,3 ... as one big integer. -; CHECK: movabsq $578437695752307201 -; CHECK: ret define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp { +; CHECK-LABEL: merge_const_store: +; CHECK: # BB#0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jle .LBB0_3 +; CHECK-NEXT: # BB#1: # %.lr.ph.preheader +; CHECK-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_2: # %.lr.ph +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq %rax, (%rsi) +; CHECK-NEXT: addq $8, %rsi +; CHECK-NEXT: decl %edi +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: .LBB0_3: # %._crit_edge +; CHECK-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge .lr.ph: @@ -39,10 +52,23 @@ define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwt } ; No vectors because we use noimplicitfloat -; CHECK-LABEL: merge_const_store_no_vec: -; CHECK-NOT: vmovups -; CHECK: ret define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{ +; CHECK-LABEL: merge_const_store_no_vec: +; CHECK: # BB#0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jle .LBB1_2 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB1_1: # %.lr.ph +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rsi) +; CHECK-NEXT: movq $0, 8(%rsi) +; CHECK-NEXT: movq $0, 16(%rsi) +; CHECK-NEXT: movq $0, 24(%rsi) +; CHECK-NEXT: addq $32, %rsi +; CHECK-NEXT: decl %edi +; CHECK-NEXT: jne .LBB1_1 +; CHECK-NEXT: .LBB1_2: # %._crit_edge +; CHECK-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge .lr.ph: @@ -73,10 +99,23 @@ define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimp } ; Move the constants using a single vector store. -; CHECK-LABEL: merge_const_store_vec: -; CHECK: vmovups -; CHECK: ret define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp { +; CHECK-LABEL: merge_const_store_vec: +; CHECK: # BB#0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jle .LBB2_3 +; CHECK-NEXT: # BB#1: # %.lr.ph.preheader +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB2_2: # %.lr.ph +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: addq $32, %rsi +; CHECK-NEXT: decl %edi +; CHECK-NEXT: jne .LBB2_2 +; CHECK-NEXT: .LBB2_3: # %._crit_edge +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge .lr.ph: @@ -107,13 +146,23 @@ define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind } ; Move the first 4 constants as a single vector. Move the rest as scalars. -; CHECK-LABEL: merge_nonconst_store: -; CHECK: movl $67305985 -; CHECK: movb -; CHECK: movw -; CHECK: movb -; CHECK: ret define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp { +; CHECK-LABEL: merge_nonconst_store: +; CHECK: # BB#0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jle .LBB3_2 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB3_1: # %.lr.ph +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movl $67305985, (%rdx) # imm = 0x4030201 +; CHECK-NEXT: movb %sil, 4(%rdx) +; CHECK-NEXT: movw $1798, 5(%rdx) # imm = 0x706 +; CHECK-NEXT: movb $8, 7(%rdx) +; CHECK-NEXT: addq $8, %rdx +; CHECK-NEXT: decl %edi +; CHECK-NEXT: jne .LBB3_1 +; CHECK-NEXT: .LBB3_2: # %._crit_edge +; CHECK-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge .lr.ph: @@ -143,15 +192,34 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) n ret void } - -; CHECK-LABEL: merge_loads_i16: -; load: -; BWON: movzwl -; BWOFF: movw -; store: -; CHECK: movw -; CHECK: ret define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp { +; BWON-LABEL: merge_loads_i16: +; BWON: # BB#0: +; BWON-NEXT: testl %edi, %edi +; BWON-NEXT: jle .LBB4_2 +; BWON-NEXT: .p2align 4, 0x90 +; BWON-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; BWON-NEXT: movzwl (%rsi), %eax +; BWON-NEXT: movw %ax, (%rdx) +; BWON-NEXT: addq $8, %rdx +; BWON-NEXT: decl %edi +; BWON-NEXT: jne .LBB4_1 +; BWON-NEXT: .LBB4_2: # %._crit_edge +; BWON-NEXT: retq +; +; BWOFF-LABEL: merge_loads_i16: +; BWOFF: # BB#0: +; BWOFF-NEXT: testl %edi, %edi +; BWOFF-NEXT: jle .LBB4_2 +; BWOFF-NEXT: .p2align 4, 0x90 +; BWOFF-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; BWOFF-NEXT: movw (%rsi), %ax +; BWOFF-NEXT: movw %ax, (%rdx) +; BWOFF-NEXT: addq $8, %rdx +; BWOFF-NEXT: decl %edi +; BWOFF-NEXT: jne .LBB4_1 +; BWOFF-NEXT: .LBB4_2: # %._crit_edge +; BWOFF-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -179,15 +247,40 @@ define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struc } ; The loads and the stores are interleaved. Can't merge them. -; CHECK-LABEL: no_merge_loads: -; BWON: movzbl -; BWOFF: movb -; CHECK: movb -; BWON: movzbl -; BWOFF: movb -; CHECK: movb -; CHECK: ret define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp { +; BWON-LABEL: no_merge_loads: +; BWON: # BB#0: +; BWON-NEXT: testl %edi, %edi +; BWON-NEXT: jle .LBB5_2 +; BWON-NEXT: .p2align 4, 0x90 +; BWON-NEXT: .LBB5_1: # %a4 +; BWON-NEXT: # =>This Inner Loop Header: Depth=1 +; BWON-NEXT: movzbl (%rsi), %eax +; BWON-NEXT: movb %al, (%rdx) +; BWON-NEXT: movzbl 1(%rsi), %eax +; BWON-NEXT: movb %al, 1(%rdx) +; BWON-NEXT: addq $8, %rdx +; BWON-NEXT: decl %edi +; BWON-NEXT: jne .LBB5_1 +; BWON-NEXT: .LBB5_2: # %._crit_edge +; BWON-NEXT: retq +; +; BWOFF-LABEL: no_merge_loads: +; BWOFF: # BB#0: +; BWOFF-NEXT: testl %edi, %edi +; BWOFF-NEXT: jle .LBB5_2 +; BWOFF-NEXT: .p2align 4, 0x90 +; BWOFF-NEXT: .LBB5_1: # %a4 +; BWOFF-NEXT: # =>This Inner Loop Header: Depth=1 +; BWOFF-NEXT: movb (%rsi), %al +; BWOFF-NEXT: movb %al, (%rdx) +; BWOFF-NEXT: movb 1(%rsi), %al +; BWOFF-NEXT: movb %al, 1(%rdx) +; BWOFF-NEXT: addq $8, %rdx +; BWOFF-NEXT: decl %edi +; BWOFF-NEXT: jne .LBB5_1 +; BWOFF-NEXT: .LBB5_2: # %._crit_edge +; BWOFF-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -214,14 +307,20 @@ a4: ; preds = %4, %.lr.ph ret void } - -; CHECK-LABEL: merge_loads_integer: -; load: -; CHECK: movq -; store: -; CHECK: movq -; CHECK: ret define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { +; CHECK-LABEL: merge_loads_integer: +; CHECK: # BB#0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jle .LBB6_2 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq (%rsi), %rax +; CHECK-NEXT: movq %rax, (%rdx) +; CHECK-NEXT: addq $32, %rdx +; CHECK-NEXT: decl %edi +; CHECK-NEXT: jne .LBB6_1 +; CHECK-NEXT: .LBB6_2: # %._crit_edge +; CHECK-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -248,14 +347,21 @@ define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %s ret void } - -; CHECK-LABEL: merge_loads_vector: -; load: -; CHECK: movups -; store: -; CHECK: movups -; CHECK: ret define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { +; CHECK-LABEL: merge_loads_vector: +; CHECK: # BB#0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jle .LBB7_2 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB7_1: # %block4 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovups (%rsi), %xmm0 +; CHECK-NEXT: vmovups %xmm0, (%rdx) +; CHECK-NEXT: addq $32, %rdx +; CHECK-NEXT: decl %edi +; CHECK-NEXT: jne .LBB7_1 +; CHECK-NEXT: .LBB7_2: # %._crit_edge +; CHECK-NEXT: retq %a1 = icmp sgt i32 %count, 0 br i1 %a1, label %.lr.ph, label %._crit_edge @@ -290,14 +396,22 @@ block4: ; preds = %4, %.lr.ph ret void } -;; On x86, even unaligned copies can be merged to vector ops. -; CHECK-LABEL: merge_loads_no_align: -; load: -; CHECK: vmovups -; store: -; CHECK: vmovups -; CHECK: ret +; On x86, even unaligned copies can be merged to vector ops. define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { +; CHECK-LABEL: merge_loads_no_align: +; CHECK: # BB#0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jle .LBB8_2 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB8_1: # %block4 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovups (%rsi), %xmm0 +; CHECK-NEXT: vmovups %xmm0, (%rdx) +; CHECK-NEXT: addq $32, %rdx +; CHECK-NEXT: decl %edi +; CHECK-NEXT: jne .LBB8_1 +; CHECK-NEXT: .LBB8_2: # %._crit_edge +; CHECK-NEXT: retq %a1 = icmp sgt i32 %count, 0 br i1 %a1, label %.lr.ph, label %._crit_edge @@ -334,11 +448,36 @@ block4: ; preds = %4, %.lr.ph ; Make sure that we merge the consecutive load/store sequence below and use a ; word (16 bit) instead of a byte copy. -; CHECK-LABEL: MergeLoadStoreBaseIndexOffset: -; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]] -; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]] -; CHECK: movw %[[REG]], (%{{.*}}) define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) { +; BWON-LABEL: MergeLoadStoreBaseIndexOffset: +; BWON: # BB#0: +; BWON-NEXT: movl %ecx, %r8d +; BWON-NEXT: xorl %ecx, %ecx +; BWON-NEXT: .p2align 4, 0x90 +; BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 +; BWON-NEXT: movq (%rdi,%rcx,8), %rax +; BWON-NEXT: movzwl (%rdx,%rax), %eax +; BWON-NEXT: movw %ax, (%rsi,%rcx,2) +; BWON-NEXT: incq %rcx +; BWON-NEXT: cmpl %ecx, %r8d +; BWON-NEXT: jne .LBB9_1 +; BWON-NEXT: # BB#2: +; BWON-NEXT: retq +; +; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset: +; BWOFF: # BB#0: +; BWOFF-NEXT: movl %ecx, %r8d +; BWOFF-NEXT: xorl %ecx, %ecx +; BWOFF-NEXT: .p2align 4, 0x90 +; BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 +; BWOFF-NEXT: movq (%rdi,%rcx,8), %rax +; BWOFF-NEXT: movw (%rdx,%rax), %ax +; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2) +; BWOFF-NEXT: incq %rcx +; BWOFF-NEXT: cmpl %ecx, %r8d +; BWOFF-NEXT: jne .LBB9_1 +; BWOFF-NEXT: # BB#2: +; BWOFF-NEXT: retq br label %1 ; <label>:1 @@ -366,12 +505,36 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) { ; Make sure that we merge the consecutive load/store sequence below and use a ; word (16 bit) instead of a byte copy for complicated address calculation. -; . -; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetComplicated: -; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]] -; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]] -; CHECK: movw %[[REG]], (%{{.*}}) define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) { +; BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated: +; BWON: # BB#0: +; BWON-NEXT: xorl %r8d, %r8d +; BWON-NEXT: .p2align 4, 0x90 +; BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 +; BWON-NEXT: movsbq (%rsi), %rax +; BWON-NEXT: movzwl (%rdx,%rax), %eax +; BWON-NEXT: movw %ax, (%rdi,%r8) +; BWON-NEXT: incq %rsi +; BWON-NEXT: addq $2, %r8 +; BWON-NEXT: cmpq %rcx, %r8 +; BWON-NEXT: jl .LBB10_1 +; BWON-NEXT: # BB#2: +; BWON-NEXT: retq +; +; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated: +; BWOFF: # BB#0: +; BWOFF-NEXT: xorl %r8d, %r8d +; BWOFF-NEXT: .p2align 4, 0x90 +; BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 +; BWOFF-NEXT: movsbq (%rsi), %rax +; BWOFF-NEXT: movw (%rdx,%rax), %ax +; BWOFF-NEXT: movw %ax, (%rdi,%r8) +; BWOFF-NEXT: incq %rsi +; BWOFF-NEXT: addq $2, %r8 +; BWOFF-NEXT: cmpq %rcx, %r8 +; BWOFF-NEXT: jl .LBB10_1 +; BWOFF-NEXT: # BB#2: +; BWOFF-NEXT: retq br label %1 ; <label>:1 @@ -401,11 +564,36 @@ define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i6 ; Make sure that we merge the consecutive load/store sequence below and use a ; word (16 bit) instead of a byte copy even if there are intermediate sign ; extensions. -; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext: -; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]] -; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]] -; CHECK: movw %[[REG]], (%{{.*}}) define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) { +; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext: +; BWON: # BB#0: +; BWON-NEXT: movl %ecx, %r8d +; BWON-NEXT: xorl %ecx, %ecx +; BWON-NEXT: .p2align 4, 0x90 +; BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; BWON-NEXT: movsbq (%rdi,%rcx), %rax +; BWON-NEXT: movzwl (%rdx,%rax), %eax +; BWON-NEXT: movw %ax, (%rsi,%rcx,2) +; BWON-NEXT: incq %rcx +; BWON-NEXT: cmpl %ecx, %r8d +; BWON-NEXT: jne .LBB11_1 +; BWON-NEXT: # BB#2: +; BWON-NEXT: retq +; +; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext: +; BWOFF: # BB#0: +; BWOFF-NEXT: movl %ecx, %r8d +; BWOFF-NEXT: xorl %ecx, %ecx +; BWOFF-NEXT: .p2align 4, 0x90 +; BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax +; BWOFF-NEXT: movw (%rdx,%rax), %ax +; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2) +; BWOFF-NEXT: incq %rcx +; BWOFF-NEXT: cmpl %ecx, %r8d +; BWOFF-NEXT: jne .LBB11_1 +; BWOFF-NEXT: # BB#2: +; BWOFF-NEXT: retq br label %1 ; <label>:1 @@ -434,10 +622,44 @@ define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) { ; However, we can only merge ignore sign extensions when they are on all memory ; computations; -; CHECK-LABEL: loadStoreBaseIndexOffsetSextNoSex: -; CHECK-NOT: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]] -; CHECK-NOT: movw [[REG]], (%{{.*}}) define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) { +; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex: +; BWON: # BB#0: +; BWON-NEXT: movl %ecx, %r8d +; BWON-NEXT: xorl %ecx, %ecx +; BWON-NEXT: .p2align 4, 0x90 +; BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 +; BWON-NEXT: movsbq (%rdi,%rcx), %rax +; BWON-NEXT: movzbl (%rdx,%rax), %r9d +; BWON-NEXT: incb %al +; BWON-NEXT: movsbq %al, %rax +; BWON-NEXT: movzbl (%rdx,%rax), %eax +; BWON-NEXT: movb %r9b, (%rsi,%rcx,2) +; BWON-NEXT: movb %al, 1(%rsi,%rcx,2) +; BWON-NEXT: incq %rcx +; BWON-NEXT: cmpl %ecx, %r8d +; BWON-NEXT: jne .LBB12_1 +; BWON-NEXT: # BB#2: +; BWON-NEXT: retq +; +; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex: +; BWOFF: # BB#0: +; BWOFF-NEXT: movl %ecx, %r8d +; BWOFF-NEXT: xorl %ecx, %ecx +; BWOFF-NEXT: .p2align 4, 0x90 +; BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 +; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax +; BWOFF-NEXT: movb (%rdx,%rax), %r9b +; BWOFF-NEXT: incb %al +; BWOFF-NEXT: movsbq %al, %rax +; BWOFF-NEXT: movb (%rdx,%rax), %al +; BWOFF-NEXT: movb %r9b, (%rsi,%rcx,2) +; BWOFF-NEXT: movb %al, 1(%rsi,%rcx,2) +; BWOFF-NEXT: incq %rcx +; BWOFF-NEXT: cmpl %ecx, %r8d +; BWOFF-NEXT: jne .LBB12_1 +; BWOFF-NEXT: # BB#2: +; BWOFF-NEXT: retq br label %1 ; <label>:1 @@ -467,6 +689,11 @@ define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) { ; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 ) define void @merge_vec_element_store(<8 x float> %v, float* %ptr) { +; CHECK-LABEL: merge_vec_element_store: +; CHECK: # BB#0: +; CHECK-NEXT: vmovups %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %vecext0 = extractelement <8 x float> %v, i32 0 %vecext1 = extractelement <8 x float> %v, i32 1 %vecext2 = extractelement <8 x float> %v, i32 2 @@ -492,15 +719,17 @@ define void @merge_vec_element_store(<8 x float> %v, float* %ptr) { store float %vecext7, float* %arrayidx7, align 4 ret void -; CHECK-LABEL: merge_vec_element_store -; CHECK: vmovups %ymm0, (%rdi) -; CHECK: vzeroupper -; CHECK: retq } ; PR21711 - Merge vector stores into wider vector stores. ; These should be merged into 32-byte stores. define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) { +; CHECK-LABEL: merge_vec_extract_stores: +; CHECK: # BB#0: +; CHECK-NEXT: vmovups %ymm0, 48(%rdi) +; CHECK-NEXT: vmovups %ymm1, 80(%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3 %idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4 %idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5 @@ -515,15 +744,16 @@ define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x flo store <4 x float> %shuffle3, <4 x float>* %idx3, align 16 ret void -; CHECK-LABEL: merge_vec_extract_stores -; CHECK: vmovups %ymm0, 48(%rdi) -; CHECK-NEXT: vmovups %ymm1, 80(%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq } ; Merging vector stores when sourced from vector loads. define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) { +; CHECK-LABEL: merge_vec_stores_from_loads: +; CHECK: # BB#0: +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0 %load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1 %v0 = load <4 x float>, <4 x float>* %load_idx0 @@ -534,31 +764,32 @@ define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) { store <4 x float> %v1, <4 x float>* %store_idx1, align 16 ret void -; CHECK-LABEL: merge_vec_stores_from_loads -; CHECK: vmovups (%rdi), %ymm0 -; CHECK-NEXT: vmovups %ymm0, (%rsi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq } -; Merging vector stores when sourced from a constant vector is not currently handled. +; Merging vector stores when sourced from a constant vector is not currently handled. define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) { +; CHECK-LABEL: merge_vec_stores_of_constants: +; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, 48(%rdi) +; CHECK-NEXT: vmovaps %xmm0, 64(%rdi) +; CHECK-NEXT: retq %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3 %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4 store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16 store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16 ret void -; CHECK-LABEL: merge_vec_stores_of_constants -; CHECK: vxorps -; CHECK-NEXT: vmovaps -; CHECK-NEXT: vmovaps -; CHECK-NEXT: retq } ; This is a minimized test based on real code that was failing. ; This should now be merged. define void @merge_vec_element_and_scalar_load([6 x i64]* %array) { +; CHECK-LABEL: merge_vec_element_and_scalar_load: +; CHECK: # BB#0: +; CHECK-NEXT: vmovups (%rdi), %xmm0 +; CHECK-NEXT: vmovups %xmm0, 32(%rdi) +; CHECK-NEXT: retq %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0 %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1 %idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4 @@ -573,14 +804,16 @@ define void @merge_vec_element_and_scalar_load([6 x i64]* %array) { store i64 %a1, i64* %idx5, align 8 ret void -; CHECK-LABEL: merge_vec_element_and_scalar_load -; CHECK: vmovups (%rdi), %xmm0 -; CHECK-NEXT: vmovups %xmm0, 32(%rdi) -; CHECK-NEXT: retq } ; Don't let a non-consecutive store thwart merging of the last two. define void @almost_consecutive_stores(i8* %p) { +; CHECK-LABEL: almost_consecutive_stores: +; CHECK: # BB#0: +; CHECK-NEXT: movb $0, (%rdi) +; CHECK-NEXT: movb $1, 42(%rdi) +; CHECK-NEXT: movw $770, 2(%rdi) # imm = 0x302 +; CHECK-NEXT: retq store i8 0, i8* %p %p1 = getelementptr i8, i8* %p, i64 42 store i8 1, i8* %p1 @@ -589,17 +822,15 @@ define void @almost_consecutive_stores(i8* %p) { %p3 = getelementptr i8, i8* %p, i64 3 store i8 3, i8* %p3 ret void -; CHECK-LABEL: almost_consecutive_stores -; CHECK-DAG: movb $0, (%rdi) -; CHECK-DAG: movb $1, 42(%rdi) -; CHECK-DAG: movw $770, 2(%rdi) -; CHECK: retq } ; We should be able to merge these. define void @merge_bitcast(<4 x i32> %v, float* %ptr) { +; CHECK-LABEL: merge_bitcast: +; CHECK: # BB#0: +; CHECK-NEXT: vmovups %xmm0, (%rdi) +; CHECK-NEXT: retq %fv = bitcast <4 x i32> %v to <4 x float> - %vecext1 = extractelement <4 x i32> %v, i32 1 %vecext2 = extractelement <4 x i32> %v, i32 2 %vecext3 = extractelement <4 x i32> %v, i32 3 @@ -616,8 +847,4 @@ define void @merge_bitcast(<4 x i32> %v, float* %ptr) { store float %f2, float* %idx2, align 4 store float %f3, float* %idx3, align 4 ret void - -; CHECK-LABEL: merge_bitcast -; CHECK: vmovups %xmm0, (%rdi) -; CHECK-NEXT: retq } |

