Diffstat (limited to 'llvm/test/Transforms/MemCpyOpt')
31 files changed, 2171 insertions, 0 deletions
diff --git a/llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll b/llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll new file mode 100644 index 00000000000..7ff149f6f77 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -basicaa -memcpyopt -dse -S | grep "call.*initialize" | not grep memtmp +; PR2077 + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" +target triple = "i386-pc-linux-gnu" + +%0 = type { x86_fp80, x86_fp80 } + +define internal fastcc void @initialize(%0* noalias nocapture sret %agg.result) nounwind { +entry: + %agg.result.03 = getelementptr %0, %0* %agg.result, i32 0, i32 0 + store x86_fp80 0xK00000000000000000000, x86_fp80* %agg.result.03 + %agg.result.15 = getelementptr %0, %0* %agg.result, i32 0, i32 1 + store x86_fp80 0xK00000000000000000000, x86_fp80* %agg.result.15 + ret void +} + +declare fastcc x86_fp80 @passed_uninitialized(%0* nocapture) nounwind + +define fastcc void @badly_optimized() nounwind { +entry: + %z = alloca %0 + %tmp = alloca %0 + %memtmp = alloca %0, align 8 + call fastcc void @initialize(%0* noalias sret %memtmp) + %tmp1 = bitcast %0* %tmp to i8* + %memtmp2 = bitcast %0* %memtmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %tmp1, i8* align 8 %memtmp2, i32 24, i1 false) + %z3 = bitcast %0* %z to i8* + %tmp4 = bitcast %0* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %z3, i8* align 8 %tmp4, i32 24, i1 false) + %tmp5 = call fastcc x86_fp80 @passed_uninitialized(%0* %z) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind diff --git a/llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll b/llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll new file mode 100644 index 00000000000..26c221d6786 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll @@ -0,0 +1,22 @@ +; RUN: opt < %s -basicaa -memcpyopt -S | not grep "call.*memcpy." +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" + +%a = type { i32 } +%b = type { float } + +declare void @g(%a* nocapture) + +define float @f() { +entry: + %a_var = alloca %a + %b_var = alloca %b, align 1 + call void @g(%a* %a_var) + %a_i8 = bitcast %a* %a_var to i8* + %b_i8 = bitcast %b* %b_var to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_i8, i8* %a_i8, i32 4, i1 false) + %tmp1 = getelementptr %b, %b* %b_var, i32 0, i32 0 + %tmp2 = load float, float* %tmp1 + ret float %tmp2 +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind diff --git a/llvm/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll b/llvm/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll new file mode 100644 index 00000000000..8e4a0230d7f --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -basicaa -memcpyopt -S | FileCheck %s +; PR10067 +; Make sure the call+copy isn't optimized in such a way that +; %ret ends up with the wrong value. 
+ +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" +target triple = "i386-apple-darwin10" + +%struct1 = type { i32, i32 } +%struct2 = type { %struct1, i8* } + +declare void @bar(%struct1* nocapture sret %agg.result) nounwind + +define i32 @foo() nounwind { + %x = alloca %struct1, align 8 + %y = alloca %struct2, align 8 + call void @bar(%struct1* sret %x) nounwind +; CHECK: call void @bar(%struct1* sret %x) + + %gepn1 = getelementptr inbounds %struct2, %struct2* %y, i32 0, i32 0, i32 0 + store i32 0, i32* %gepn1, align 8 + %gepn2 = getelementptr inbounds %struct2, %struct2* %y, i32 0, i32 0, i32 1 + store i32 0, i32* %gepn2, align 4 + + %bit1 = bitcast %struct1* %x to i64* + %bit2 = bitcast %struct2* %y to i64* + %load = load i64, i64* %bit1, align 8 + store i64 %load, i64* %bit2, align 8 + +; CHECK: %load = load i64, i64* %bit1, align 8 +; CHECK: store i64 %load, i64* %bit2, align 8 + + %gep1 = getelementptr %struct2, %struct2* %y, i32 0, i32 0, i32 0 + %ret = load i32, i32* %gep1 + ret i32 %ret +} diff --git a/llvm/test/Transforms/MemCpyOpt/align.ll b/llvm/test/Transforms/MemCpyOpt/align.ll new file mode 100644 index 00000000000..738928bac62 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/align.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +; The resulting memset is only 4-byte aligned, despite containing +; a 16-byte aligned store in the middle. + +define void @foo(i32* %p) { +; CHECK-LABEL: @foo( +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 {{.*}}, i8 0, i64 16, i1 false) + %a0 = getelementptr i32, i32* %p, i64 0 + store i32 0, i32* %a0, align 4 + %a1 = getelementptr i32, i32* %p, i64 1 + store i32 0, i32* %a1, align 16 + %a2 = getelementptr i32, i32* %p, i64 2 + store i32 0, i32* %a2, align 4 + %a3 = getelementptr i32, i32* %p, i64 3 + store i32 0, i32* %a3, align 4 + ret void +} + +; Replacing %a8 with %a4 in the memset requires boosting the alignment of %a4. 
+ +define void @bar() { +; CHECK-LABEL: @bar( +; CHECK: %a4 = alloca i32, align 8 +; CHECK-NOT: memcpy + %a4 = alloca i32, align 4 + %a8 = alloca i32, align 8 + %a8.cast = bitcast i32* %a8 to i8* + %a4.cast = bitcast i32* %a4 to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %a8.cast, i8 0, i64 4, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %a4.cast, i8* align 4 %a8.cast, i64 4, i1 false) + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/atomic.ll b/llvm/test/Transforms/MemCpyOpt/atomic.ll new file mode 100644 index 00000000000..f8fc39faa58 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/atomic.ll @@ -0,0 +1,41 @@ +; RUN: opt -basicaa -memcpyopt -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-apple-macosx10.7.0" + +@x = global i32 0 + +declare void @otherf(i32*) + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +; memcpyopt should not touch atomic ops +define void @test1() nounwind uwtable ssp { +; CHECK: test1 +; CHECK: store atomic + %x = alloca [101 x i32], align 16 + %bc = bitcast [101 x i32]* %x to i8* + call void @llvm.memset.p0i8.i64(i8* align 16 %bc, i8 0, i64 400, i1 false) + %gep1 = getelementptr inbounds [101 x i32], [101 x i32]* %x, i32 0, i32 100 + store atomic i32 0, i32* %gep1 unordered, align 4 + %gep2 = getelementptr inbounds [101 x i32], [101 x i32]* %x, i32 0, i32 0 + call void @otherf(i32* %gep2) + ret void +} + +; memcpyopt across unordered store +define void @test2() nounwind uwtable ssp { +; CHECK: test2 +; CHECK: call +; CHECK-NEXT: store atomic +; CHECK-NEXT: call + %old = alloca i32 + %new = alloca i32 + call void @otherf(i32* nocapture %old) + store atomic i32 0, i32* @x unordered, align 4 + %v = load i32, i32* %old + store i32 %v, i32* %new + call void @otherf(i32* nocapture %new) + ret void +} + diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_aa.ll b/llvm/test/Transforms/MemCpyOpt/callslot_aa.ll new file mode 100644 index 00000000000..d840b726a9a --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/callslot_aa.ll @@ -0,0 +1,22 @@ +; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%T = type { i64, i64 } + +define void @test(i8* %src) { + %tmp = alloca i8 + %dst = alloca i8 +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %src, i64 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %tmp, i8* align 8 %src, i64 1, i1 false), !noalias !2 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %tmp, i64 1, i1 false) + + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1) + +; Check that the noalias for "dst" was removed by checking that the metadata is gone +; CHECK-NOT: "dst" +!0 = !{!0} +!1 = distinct !{!1, !0, !"dst"} +!2 = distinct !{!1} diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_deref.ll b/llvm/test/Transforms/MemCpyOpt/callslot_deref.ll new file mode 100644 index 00000000000..a1ba2bae79e --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/callslot_deref.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s +target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) unnamed_addr nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +; all bytes of %dst 
that are touched by the memset are dereferenceable +define void @must_remove_memcpy(i8* noalias nocapture dereferenceable(4096) %dst) { +; CHECK-LABEL: @must_remove_memcpy( +; CHECK: call void @llvm.memset.p0i8.i64 +; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64 + %src = alloca [4096 x i8], align 1 + %p = getelementptr inbounds [4096 x i8], [4096 x i8]* %src, i64 0, i64 0 + call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 4096, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %p, i64 4096, i1 false) #2 + ret void +} + +; memset touches more bytes than those guaranteed to be dereferenceable +; We can't remove the memcpy, but we can turn it into an independent memset. +define void @must_not_remove_memcpy(i8* noalias nocapture dereferenceable(1024) %dst) { +; CHECK-LABEL: @must_not_remove_memcpy( +; CHECK: call void @llvm.memset.p0i8.i64 +; CHECK: call void @llvm.memset.p0i8.i64 + %src = alloca [4096 x i8], align 1 + %p = getelementptr inbounds [4096 x i8], [4096 x i8]* %src, i64 0, i64 0 + call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 4096, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %p, i64 4096, i1 false) #2 + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_throw.ll b/llvm/test/Transforms/MemCpyOpt/callslot_throw.ll new file mode 100644 index 00000000000..1aa4c92efc7 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/callslot_throw.ll @@ -0,0 +1,34 @@ +; RUN: opt -S -memcpyopt < %s | FileCheck %s +declare void @may_throw(i32* nocapture %x) + +; CHECK-LABEL: define void @test1( +define void @test1(i32* nocapture noalias dereferenceable(4) %x) { +entry: + %t = alloca i32, align 4 + call void @may_throw(i32* nonnull %t) + %load = load i32, i32* %t, align 4 + store i32 %load, i32* %x, align 4 +; CHECK: %[[t:.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @may_throw(i32* {{.*}} %[[t]]) +; CHECK-NEXT: %[[load:.*]] = load i32, i32* %[[t]], align 4 +; CHECK-NEXT: store i32 %[[load]], i32* %x, align 4 + ret void +} + +declare void @always_throws() + +; CHECK-LABEL: define void @test2( +define void @test2(i32* nocapture noalias dereferenceable(4) %x) { +entry: + %t = alloca i32, align 4 + call void @may_throw(i32* nonnull %t) nounwind + %load = load i32, i32* %t, align 4 + call void @always_throws() + store i32 %load, i32* %x, align 4 +; CHECK: %[[t:.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @may_throw(i32* {{.*}} %[[t]]) +; CHECK-NEXT: %[[load:.*]] = load i32, i32* %[[t]], align 4 +; CHECK-NEXT: call void @always_throws() +; CHECK-NEXT: store i32 %[[load]], i32* %x, align 4 + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/capturing-func.ll b/llvm/test/Transforms/MemCpyOpt/capturing-func.ll new file mode 100644 index 00000000000..2671a9aad81 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/capturing-func.ll @@ -0,0 +1,22 @@ +; RUN: opt < %s -basicaa -memcpyopt -S | FileCheck %s + +target datalayout = "e" + +declare void @foo(i8*) +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind + +define void @test() { + %ptr1 = alloca i8 + %ptr2 = alloca i8 + call void @foo(i8* %ptr2) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i1 false) + call void @foo(i8* %ptr1) + ret void + + ; Check that the transformation isn't applied if the called function can + ; capture the pointer argument (i.e.
the nocapture attribute isn't present) + ; CHECK-LABEL: @test( + ; CHECK: call void @foo(i8* %ptr2) + ; CHECK-NEXT: call void @llvm.memcpy + ; CHECK-NEXT: call void @foo(i8* %ptr1) +} diff --git a/llvm/test/Transforms/MemCpyOpt/crash.ll b/llvm/test/Transforms/MemCpyOpt/crash.ll new file mode 100644 index 00000000000..464a261e4e4 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/crash.ll @@ -0,0 +1,55 @@ +; RUN: opt < %s -basicaa -memcpyopt -disable-output + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" +target triple = "armv7-eabi" + +%struct.qw = type { [4 x float] } +%struct.bar = type { %struct.qw, %struct.qw, %struct.qw, %struct.qw, %struct.qw, float, float} + +; PR4882 +define void @test1(%struct.bar* %this) { +entry: + %0 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 0, i32 0, i32 0 + store float 0.000000e+00, float* %0, align 4 + %1 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 0, i32 0, i32 1 + store float 0.000000e+00, float* %1, align 4 + %2 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 0, i32 0, i32 2 + store float 0.000000e+00, float* %2, align 4 + %3 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 0, i32 0, i32 3 + store float 0.000000e+00, float* %3, align 4 + %4 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 1, i32 0, i32 0 + store float 0.000000e+00, float* %4, align 4 + %5 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 1, i32 0, i32 1 + store float 0.000000e+00, float* %5, align 4 + %6 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 1, i32 0, i32 2 + store float 0.000000e+00, float* %6, align 4 + %7 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 1, i32 0, i32 3 + store float 0.000000e+00, float* %7, align 4 + %8 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 3, i32 0, i32 1 + store float 0.000000e+00, float* %8, align 4 + %9 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 3, i32 0, i32 2 + store float 0.000000e+00, float* %9, align 4 + %10 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 3, i32 0, i32 3 + store float 0.000000e+00, float* %10, align 4 + %11 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 4, i32 0, i32 0 + store float 0.000000e+00, float* %11, align 4 + %12 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 4, i32 0, i32 1 + store float 0.000000e+00, float* %12, align 4 + %13 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 4, i32 0, i32 2 + store float 0.000000e+00, float* %13, align 4 + %14 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 4, i32 0, i32 3 + store float 0.000000e+00, float* %14, align 4 + %15 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 5 + store float 0.000000e+00, float* %15, align 4 + unreachable +} + +; PR8753 + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind + +define void @test2(i32 %cmd) nounwind { + call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* undef, i64 20, i1 false) nounwind + call void @llvm.memcpy.p0i8.p0i8.i64(i8* null, i8* undef, i64 20, i1 false) nounwind + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll new file mode 100644 index 00000000000..6ce1aee338d --- /dev/null +++ 
b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll @@ -0,0 +1,129 @@ +; RUN: opt -memcpyopt -S < %s | FileCheck %s + +target datalayout = "e-i64:64-f80:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +%S = type { i8*, i8, i32 } + +define void @copy(%S* %src, %S* %dst) { +; CHECK-LABEL: copy +; CHECK-NOT: load +; CHECK: call void @llvm.memmove.p0i8.p0i8.i64 +; CHECK-NEXT: ret void + %1 = load %S, %S* %src + store %S %1, %S* %dst + ret void +} + +define void @noaliassrc(%S* noalias %src, %S* %dst) { +; CHECK-LABEL: noaliassrc +; CHECK-NOT: load +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +; CHECK-NEXT: ret void + %1 = load %S, %S* %src + store %S %1, %S* %dst + ret void +} + +define void @noaliasdst(%S* %src, %S* noalias %dst) { +; CHECK-LABEL: noaliasdst +; CHECK-NOT: load +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +; CHECK-NEXT: ret void + %1 = load %S, %S* %src + store %S %1, %S* %dst + ret void +} + +define void @destroysrc(%S* %src, %S* %dst) { +; CHECK-LABEL: destroysrc +; CHECK: load %S, %S* %src +; CHECK: call void @llvm.memset.p0i8.i64 +; CHECK-NEXT: store %S %1, %S* %dst +; CHECK-NEXT: ret void + %1 = load %S, %S* %src + store %S zeroinitializer, %S* %src + store %S %1, %S* %dst + ret void +} + +define void @destroynoaliassrc(%S* noalias %src, %S* %dst) { +; CHECK-LABEL: destroynoaliassrc +; CHECK-NOT: load +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64 +; CHECK-NEXT: ret void + %1 = load %S, %S* %src + store %S zeroinitializer, %S* %src + store %S %1, %S* %dst + ret void +} + +define void @copyalias(%S* %src, %S* %dst) { +; CHECK-LABEL: copyalias +; CHECK-NEXT: [[LOAD:%[a-z0-9\.]+]] = load %S, %S* %src +; CHECK-NOT: load +; CHECK: call void @llvm.memmove.p0i8.p0i8.i64 +; CHECK-NEXT: store %S [[LOAD]], %S* %dst +; CHECK-NEXT: ret void + %1 = load %S, %S* %src + %2 = load %S, %S* %src + store %S %1, %S* %dst + store %S %2, %S* %dst + ret void +} + +; If the store address is computed in a complex manner, make +; sure we lift the computation as well if needed and possible. 
+define void @addrproducer(%S* %src, %S* %dst) { +; CHECK-LABEL: addrproducer( +; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8* +; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i64 1 +; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8* +; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8* +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false) +; CHECK-NEXT: ret void + %1 = load %S, %S* %src + store %S undef, %S* %dst + %dst2 = getelementptr %S , %S* %dst, i64 1 + store %S %1, %S* %dst2 + ret void +} + +define void @aliasaddrproducer(%S* %src, %S* %dst, i32* %dstidptr) { +; CHECK-LABEL: aliasaddrproducer( +; CHECK-NEXT: %[[SRC:[0-9]+]] = load %S, %S* %src +; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false) +; CHECK-NEXT: %dstindex = load i32, i32* %dstidptr +; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex +; CHECK-NEXT: store %S %[[SRC]], %S* %dst2 +; CHECK-NEXT: ret void + %1 = load %S, %S* %src + store %S undef, %S* %dst + %dstindex = load i32, i32* %dstidptr + %dst2 = getelementptr %S , %S* %dst, i32 %dstindex + store %S %1, %S* %dst2 + ret void +} + +define void @noaliasaddrproducer(%S* %src, %S* noalias %dst, i32* noalias %dstidptr) { +; CHECK-LABEL: noaliasaddrproducer( +; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8* +; CHECK-NEXT: %[[LOADED:[0-9]+]] = load i32, i32* %dstidptr +; CHECK-NEXT: %dstindex = or i32 %[[LOADED]], 1 +; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex +; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8* +; CHECK-NEXT: %[[SRCCAST2:[0-9]+]] = bitcast %S* %src to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST2]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[SRCCAST]], i8 undef, i64 16, i1 false) +; CHECK-NEXT: ret void + %1 = load %S, %S* %src + store %S undef, %S* %src + %2 = load i32, i32* %dstidptr + %dstindex = or i32 %2, 1 + %dst2 = getelementptr %S , %S* %dst, i32 %dstindex + store %S %1, %S* %dst2 + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/form-memset.ll b/llvm/test/Transforms/MemCpyOpt/form-memset.ll new file mode 100644 index 00000000000..836a6107d24 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/form-memset.ll @@ -0,0 +1,301 @@ +; RUN: opt < %s -memcpyopt -S | FileCheck %s + +; All the stores in this example should be merged into a single memset. 
+ +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target triple = "i386-apple-darwin8" + +define void @test1(i8 signext %c) nounwind { +entry: + %x = alloca [19 x i8] ; <[19 x i8]*> [#uses=20] + %tmp = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 0 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp, align 1 + %tmp5 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 1 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp5, align 1 + %tmp9 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 2 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp9, align 1 + %tmp13 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 3 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp13, align 1 + %tmp17 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 4 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp17, align 1 + %tmp21 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 5 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp21, align 1 + %tmp25 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 6 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp25, align 1 + %tmp29 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 7 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp29, align 1 + %tmp33 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 8 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp33, align 1 + %tmp37 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 9 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp37, align 1 + %tmp41 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 10 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp41, align 1 + %tmp45 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 11 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp45, align 1 + %tmp49 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 12 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp49, align 1 + %tmp53 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 13 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp53, align 1 + %tmp57 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 14 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp57, align 1 + %tmp61 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 15 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp61, align 1 + %tmp65 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 16 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp65, align 1 + %tmp69 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 17 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp69, align 1 + %tmp73 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 18 ; <i8*> [#uses=1] + store i8 %c, i8* %tmp73, align 1 + %tmp76 = call i32 (...) @bar( [19 x i8]* %x ) nounwind + ret void +; CHECK-LABEL: @test1( +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64 +; CHECK-NOT: store +; CHECK: ret +} + +declare i32 @bar(...) 
+ +%struct.MV = type { i16, i16 } + + +define void @test2() nounwind { +entry: + %ref_idx = alloca [8 x i8] ; <[8 x i8]*> [#uses=8] + %left_mvd = alloca [8 x %struct.MV] ; <[8 x %struct.MV]*> [#uses=17] + %up_mvd = alloca [8 x %struct.MV] ; <[8 x %struct.MV]*> [#uses=17] + %tmp20 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 7 ; <i8*> [#uses=1] + store i8 -1, i8* %tmp20, align 1 + %tmp23 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 6 ; <i8*> [#uses=1] + store i8 -1, i8* %tmp23, align 1 + %tmp26 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 5 ; <i8*> [#uses=1] + store i8 -1, i8* %tmp26, align 1 + %tmp29 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 4 ; <i8*> [#uses=1] + store i8 -1, i8* %tmp29, align 1 + %tmp32 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 3 ; <i8*> [#uses=1] + store i8 -1, i8* %tmp32, align 1 + %tmp35 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 2 ; <i8*> [#uses=1] + store i8 -1, i8* %tmp35, align 1 + %tmp38 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 1 ; <i8*> [#uses=1] + store i8 -1, i8* %tmp38, align 1 + %tmp41 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 0 ; <i8*> [#uses=2] + store i8 -1, i8* %tmp41, align 1 + %tmp43 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp43, align 2 + %tmp46 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp46, align 2 + %tmp57 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp57, align 2 + %tmp60 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp60, align 2 + %tmp71 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp71, align 2 + %tmp74 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp74, align 2 + %tmp85 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp85, align 2 + %tmp88 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp88, align 2 + %tmp99 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp99, align 2 + %tmp102 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp102, align 2 + %tmp113 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp113, align 2 + %tmp116 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp116, align 2 + %tmp127 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp127, align 2 + %tmp130 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp130, align 2 + %tmp141 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp141, align 8 + %tmp144 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 1 ; <i16*> [#uses=1] + store i16 0, 
i16* %tmp144, align 2 + %tmp148 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp148, align 2 + %tmp151 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp151, align 2 + %tmp162 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp162, align 2 + %tmp165 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp165, align 2 + %tmp176 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp176, align 2 + %tmp179 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp179, align 2 + %tmp190 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp190, align 2 + %tmp193 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp193, align 2 + %tmp204 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp204, align 2 + %tmp207 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp207, align 2 + %tmp218 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp218, align 2 + %tmp221 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp221, align 2 + %tmp232 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp232, align 2 + %tmp235 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp235, align 2 + %tmp246 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 0 ; <i16*> [#uses=1] + store i16 0, i16* %tmp246, align 8 + %tmp249 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 1 ; <i16*> [#uses=1] + store i16 0, i16* %tmp249, align 2 + %up_mvd252 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 0 ; <%struct.MV*> [#uses=1] + %left_mvd253 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 0 ; <%struct.MV*> [#uses=1] + call void @foo( %struct.MV* %up_mvd252, %struct.MV* %left_mvd253, i8* %tmp41 ) nounwind + ret void + +; CHECK-LABEL: @test2( +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %tmp41, i8 -1, i64 8, i1 false) +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 %0, i8 0, i64 32, i1 false) +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 %1, i8 0, i64 32, i1 false) +; CHECK-NOT: store +; CHECK: ret +} + +declare void @foo(%struct.MV*, %struct.MV*, i8*) + + +; Store followed by memset. 
+define void @test3(i32* nocapture %P) nounwind ssp { +entry: + %arrayidx = getelementptr inbounds i32, i32* %P, i64 1 + store i32 0, i32* %arrayidx, align 4 + %add.ptr = getelementptr inbounds i32, i32* %P, i64 2 + %0 = bitcast i32* %add.ptr to i8* + tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 11, i1 false) + ret void +; CHECK-LABEL: @test3( +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 15, i1 false) +} + +; store followed by memset, different offset scenario +define void @test4(i32* nocapture %P) nounwind ssp { +entry: + store i32 0, i32* %P, align 4 + %add.ptr = getelementptr inbounds i32, i32* %P, i64 1 + %0 = bitcast i32* %add.ptr to i8* + tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 11, i1 false) + ret void +; CHECK-LABEL: @test4( +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 15, i1 false) +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +; Memset followed by store. +define void @test5(i32* nocapture %P) nounwind ssp { +entry: + %add.ptr = getelementptr inbounds i32, i32* %P, i64 2 + %0 = bitcast i32* %add.ptr to i8* + tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 11, i1 false) + %arrayidx = getelementptr inbounds i32, i32* %P, i64 1 + store i32 0, i32* %arrayidx, align 4 + ret void +; CHECK-LABEL: @test5( +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 15, i1 false) +} + +;; Memset followed by memset. +define void @test6(i32* nocapture %P) nounwind ssp { +entry: + %0 = bitcast i32* %P to i8* + tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 12, i1 false) + %add.ptr = getelementptr inbounds i32, i32* %P, i64 3 + %1 = bitcast i32* %add.ptr to i8* + tail call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 12, i1 false) + ret void +; CHECK-LABEL: @test6( +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %2, i8 0, i64 24, i1 false) +} + +; More aggressive heuristic +; rdar://9892684 +define void @test7(i32* nocapture %c) nounwind optsize { + store i32 -1, i32* %c, align 4 + %1 = getelementptr inbounds i32, i32* %c, i32 1 + store i32 -1, i32* %1, align 4 + %2 = getelementptr inbounds i32, i32* %c, i32 2 + store i32 -1, i32* %2, align 4 + %3 = getelementptr inbounds i32, i32* %c, i32 3 + store i32 -1, i32* %3, align 4 + %4 = getelementptr inbounds i32, i32* %c, i32 4 + store i32 -1, i32* %4, align 4 +; CHECK-LABEL: @test7( +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %5, i8 -1, i64 20, i1 false) + ret void +} + +%struct.test8 = type { [4 x i32] } + +define void @test8() { +entry: + %memtmp = alloca %struct.test8, align 16 + %0 = bitcast %struct.test8* %memtmp to <4 x i32>* + store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %0, align 16 + ret void +; CHECK-LABEL: @test8( +; CHECK: store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %0, align 16 +} + +@test9buf = internal unnamed_addr global [16 x i64] zeroinitializer, align 16 + +define void @test9() nounwind { + store i8 -1, i8* bitcast ([16 x i64]* @test9buf to i8*), align 16 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 1), align 1 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 2), align 2 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 3), align 1 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 4), align 4 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x 
i64]* @test9buf to i8*), i64 5), align 1 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 6), align 2 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 7), align 1 + store i8 -1, i8* bitcast (i64* getelementptr inbounds ([16 x i64], [16 x i64]* @test9buf, i64 0, i64 1) to i8*), align 8 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 9), align 1 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 10), align 2 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 11), align 1 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 12), align 4 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 13), align 1 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 14), align 2 + store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 15), align 1 + ret void +; CHECK-LABEL: @test9( +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 bitcast ([16 x i64]* @test9buf to i8*), i8 -1, i64 16, i1 false) +} + +; PR19092 +define void @test10(i8* nocapture %P) nounwind { + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 0, i64 42, i1 false) + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 0, i64 23, i1 false) + ret void +; CHECK-LABEL: @test10( +; CHECK-NOT: memset +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %P, i8 0, i64 42, i1 false) +; CHECK-NOT: memset +; CHECK: ret void +} + +; Memset followed by odd store. +define void @test11(i32* nocapture %P) nounwind ssp { +entry: + %add.ptr = getelementptr inbounds i32, i32* %P, i64 3 + %0 = bitcast i32* %add.ptr to i8* + tail call void @llvm.memset.p0i8.i64(i8* %0, i8 1, i64 11, i1 false) + %arrayidx = getelementptr inbounds i32, i32* %P, i64 0 + %arrayidx.cast = bitcast i32* %arrayidx to i96* + store i96 310698676526526814092329217, i96* %arrayidx.cast, align 4 + ret void +; CHECK-LABEL: @test11( +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 1, i64 23, i1 false) +} diff --git a/llvm/test/Transforms/MemCpyOpt/invariant.start.ll b/llvm/test/Transforms/MemCpyOpt/invariant.start.ll new file mode 100644 index 00000000000..4842114b584 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/invariant.start.ll @@ -0,0 +1,47 @@ +; MemCpy optimizations should take place even in presence of invariant.start +; RUN: opt < %s -basicaa -memcpyopt -dse -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +target triple = "i686-apple-darwin9" + +%0 = type { x86_fp80, x86_fp80 } +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) + +declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly + +; FIXME: The invariant.start does not modify %P. +; The intermediate alloca and one of the memcpy's should be eliminated, the +; other should be transformed to a memmove. 
+define void @test1(i8* %P, i8* %Q) nounwind { + %memtmp = alloca %0, align 16 + %R = bitcast %0* %memtmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false) + %i = call {}* @llvm.invariant.start.p0i8(i64 32, i8* %P) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false) + ret void +; CHECK-LABEL: @test1( +; CHECK-NEXT: %memtmp = alloca %0, align 16 +; CHECK-NEXT: %R = bitcast %0* %memtmp to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false) +; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 32, i8* %P) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false) +; CHECK-NEXT: ret void +} + + +; The invariant.start intrinsic does not inhibit transforming the memcpy to a +; memset. +define void @test2(i8* %dst1, i8* %dst2, i8 %c) { +; CHECK-LABEL: define void @test2( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false) +; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 32, i8* %dst1) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %dst2, i8 %c, i64 128, i1 false) +; CHECK-NEXT: ret void + call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false) + %i = call {}* @llvm.invariant.start.p0i8(i64 32, i8* %dst1) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i1 false) + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll new file mode 100644 index 00000000000..9ddf3f4f9c2 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll @@ -0,0 +1,25 @@ +; RUN: opt < %s -O1 -S | FileCheck %s + +; performCallSlotOptzn in MemCpy should not exchange the calls to +; @llvm.lifetime.start and @llvm.memcpy.
+ +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +define void @_ZN4CordC2EOS_(i8* nocapture dereferenceable(16) %arg1) { +bb: +; CHECK-LABEL: @_ZN4CordC2EOS_ +; CHECK-NOT: call void @llvm.lifetime.start +; CHECK: ret void + %tmp = alloca [8 x i8], align 8 + %tmp5 = bitcast [8 x i8]* %tmp to i8* + call void @llvm.lifetime.start.p0i8(i64 16, i8* %tmp5) + %tmp10 = getelementptr inbounds i8, i8* %tmp5, i64 7 + store i8 0, i8* %tmp10, align 1 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %arg1, i8* align 8 %tmp5, i64 16, i1 false) + call void @llvm.lifetime.end.p0i8(i64 16, i8* %tmp5) + ret void +} + +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll new file mode 100644 index 00000000000..9dbba093fe2 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -basicaa -scoped-noalias -memcpyopt -S %s | FileCheck %s + +%T = type { i8, i32 } + +; Ensure load-store forwarding of an aggregate is interpreted as +; a memmove when the source and dest may alias +define void @test_memmove(%T* align 8 %a, %T* align 16 %b) { +; CHECK-LABEL: @test_memmove( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast %T* [[B:%.*]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %T* [[A:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 16 [[TMP1]], i8* align 8 [[TMP2]], i64 8, i1 false) +; CHECK-NEXT: ret void +; + %val = load %T, %T* %a, align 8 + store %T %val, %T* %b, align 16 + ret void +} + +; Ensure load-store forwarding of an aggregate is interpreted as +; a memcpy when the source and dest do not alias +define void @test_memcpy(%T* noalias align 8 %a, %T* noalias align 16 %b) { +; CHECK-LABEL: @test_memcpy( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast %T* [[B:%.*]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %T* [[A:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP1]], i8* align 8 [[TMP2]], i64 8, i1 false) +; CHECK-NEXT: ret void +; + %val = load %T, %T* %a, align 8 + store %T %val, %T* %b, align 16 + ret void +} + +; memcpy(%d, %a) should not be generated since store2 may-aliases load %a. 
+define void @f(%T* %a, %T* %b, %T* %c, %T* %d) { +; CHECK-LABEL: @f( +; CHECK-NEXT: [[VAL:%.*]] = load %T, %T* %a, !alias.scope !0 +; CHECK-NEXT: store %T { i8 23, i32 23 }, %T* %b, !alias.scope !3 +; CHECK-NEXT: store %T { i8 44, i32 44 }, %T* %c, !alias.scope !6, !noalias !3 +; CHECK-NEXT: store %T [[VAL]], %T* %d, !alias.scope !9, !noalias !12 +; CHECK-NEXT: ret void +; + %val = load %T, %T* %a, !alias.scope !{!10} + + ; store1 may-aliases the load + store %T { i8 23, i32 23 }, %T* %b, !alias.scope !{!11} + + ; store2 may-aliases the load and store3 + store %T { i8 44, i32 44 }, %T* %c, !alias.scope !{!12}, !noalias !{!11} + + ; store3 + store %T %val, %T* %d, !alias.scope !{!13}, !noalias !{!10, !11} + ret void +} + +!0 = !{!0} +!1 = !{!1} +!2 = !{!2} +!3 = !{!3} + +!10 = !{ !10, !0 } +!11 = !{ !11, !1 } +!12 = !{ !12, !2 } +!13 = !{ !13, !3 } diff --git a/llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll b/llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll new file mode 100644 index 00000000000..4c6136cf625 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll @@ -0,0 +1,25 @@ +; RUN: opt -S < %s -basicaa -memcpyopt | FileCheck %s +; <rdar://problem/8536696> + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-apple-darwin10.0.0" + +%"class.std::auto_ptr" = type { i32* } + +; CHECK-LABEL: @_Z3foov( +define void @_Z3foov(%"class.std::auto_ptr"* noalias nocapture sret %agg.result) ssp { +_ZNSt8auto_ptrIiED1Ev.exit: + %temp.lvalue = alloca %"class.std::auto_ptr", align 8 +; CHECK: call void @_Z3barv(%"class.std::auto_ptr"* sret %agg.result) + call void @_Z3barv(%"class.std::auto_ptr"* sret %temp.lvalue) + %tmp.i.i = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* %temp.lvalue, i64 0, i32 0 +; CHECK-NOT: load + %tmp2.i.i = load i32*, i32** %tmp.i.i, align 8 + %tmp.i.i4 = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* %agg.result, i64 0, i32 0 +; CHECK-NOT: store + store i32* %tmp2.i.i, i32** %tmp.i.i4, align 8 +; CHECK: ret void + ret void +} + +declare void @_Z3barv(%"class.std::auto_ptr"* nocapture sret) nounwind diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll new file mode 100644 index 00000000000..73d567a00ae --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll @@ -0,0 +1,55 @@ +; RUN: opt -basicaa -memcpyopt -instcombine -S < %s | FileCheck %s + +target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo([8 x i64]* noalias nocapture sret dereferenceable(64) %sret) { +entry-block: + %a = alloca [8 x i64], align 8 + %a.cast = bitcast [8 x i64]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 64, i8* %a.cast) + call void @llvm.memset.p0i8.i64(i8* align 8 %a.cast, i8 0, i64 64, i1 false) + %sret.cast = bitcast [8 x i64]* %sret to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %sret.cast, i8* align 8 %a.cast, i64 64, i1 false) + call void @llvm.lifetime.end.p0i8(i64 64, i8* %a.cast) + ret void + +; CHECK-LABEL: @foo( +; CHECK: %[[sret_cast:[^=]+]] = bitcast [8 x i64]* %sret to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %[[sret_cast]], i8 0, i64 64 +; CHECK-NOT: call void @llvm.memcpy +; CHECK: ret void +} + +define void @bar([8 x i64]* noalias nocapture sret 
dereferenceable(64) %sret, [8 x i64]* noalias nocapture dereferenceable(64) %out) { +entry-block: + %a = alloca [8 x i64], align 8 + %a.cast = bitcast [8 x i64]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 64, i8* %a.cast) + call void @llvm.memset.p0i8.i64(i8* align 8 %a.cast, i8 0, i64 64, i1 false) + %sret.cast = bitcast [8 x i64]* %sret to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %sret.cast, i8* align 8 %a.cast, i64 64, i1 false) + call void @llvm.memset.p0i8.i64(i8* align 8 %a.cast, i8 42, i64 32, i1 false) + %out.cast = bitcast [8 x i64]* %out to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %out.cast, i8* align 8 %a.cast, i64 64, i1 false) + call void @llvm.lifetime.end.p0i8(i64 64, i8* %a.cast) + ret void + +; CHECK-LABEL: @bar( +; CHECK: %[[a:[^=]+]] = alloca [8 x i64] +; CHECK: %[[a_cast:[^=]+]] = bitcast [8 x i64]* %[[a]] to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %[[a_cast]], i8 0, i64 64 +; CHECK: %[[sret_cast:[^=]+]] = bitcast [8 x i64]* %sret to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %[[sret_cast]], i8 0, i64 64 +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %[[a_cast]], i8 42, i64 32 +; CHECK: %[[out_cast:[^=]+]] = bitcast [8 x i64]* %out to i8* +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 %[[out_cast]], i8* nonnull align 8 %[[a_cast]], i64 64 +; CHECK-NOT: call void @llvm.memcpy +; CHECK: ret void +} + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll new file mode 100644 index 00000000000..1424ca3709c --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll @@ -0,0 +1,89 @@ +; RUN: opt -memcpyopt -S < %s | FileCheck %s + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind + +@undef = internal constant i32 undef, align 4 +define void @test_undef() nounwind { + %a = alloca i32, align 4 + %i8 = bitcast i32* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (i32* @undef to i8*), i64 4, i1 false) + ret void +; CHECK-LABEL: @test_undef( +; CHECK: call void @llvm.memset +; CHECK-NOT: call void @llvm.memcpy +; CHECK: ret void +} + +@i32x3 = internal constant [3 x i32] [i32 -1, i32 -1, i32 -1], align 4 +define void @test_i32x3() nounwind { + %a = alloca [3 x i32], align 4 + %i8 = bitcast [3 x i32]* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast ([3 x i32]* @i32x3 to i8*), i64 12, i1 false) + ret void +; CHECK-LABEL: @test_i32x3( +; CHECK: call void @llvm.memset +; CHECK-NOT: call void @llvm.memcpy +; CHECK: ret void +} + +@i32x3_undef = internal constant [3 x i32] [i32 -1, i32 undef, i32 -1], align 4 +define void @test_i32x3_undef() nounwind { + %a = alloca [3 x i32], align 4 + %i8 = bitcast [3 x i32]* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast ([3 x i32]* @i32x3_undef to i8*), i64 12, i1 false) + ret void +; CHECK-LABEL: @test_i32x3_undef( +; CHECK: call void @llvm.memset +; CHECK-NOT: call void @llvm.memcpy +; CHECK: ret void +} + +%struct.bitfield = type { i8, [3 x i8] } +@bitfield = private unnamed_addr constant 
%struct.bitfield { i8 -86, [3 x i8] [i8 -86, i8 -86, i8 -86] }, align 4 +define void @test_bitfield() nounwind { + %a = alloca %struct.bitfield, align 4 + %i8 = bitcast %struct.bitfield* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (%struct.bitfield* @bitfield to i8*), i64 4, i1 false) + ret void +; CHECK-LABEL: @test_bitfield( +; CHECK: call void @llvm.memset +; CHECK-NOT: call void @llvm.memcpy +; CHECK: ret void +} + +@i1x16_zero = internal constant <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, align 4 +define void @test_i1x16_zero() nounwind { + %a = alloca <16 x i1>, align 4 + %i8 = bitcast <16 x i1>* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (<16 x i1>* @i1x16_zero to i8*), i64 16, i1 false) + ret void +; CHECK-LABEL: @test_i1x16_zero( +; CHECK: call void @llvm.memset +; CHECK-NOT: call void @llvm.memcpy +; CHECK: ret void +} + +; i1 isn't currently handled. Should it? +@i1x16_one = internal constant <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, align 4 +define void @test_i1x16_one() nounwind { + %a = alloca <16 x i1>, align 4 + %i8 = bitcast <16 x i1>* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (<16 x i1>* @i1x16_one to i8*), i64 16, i1 false) + ret void +; CHECK-LABEL: @test_i1x16_one( +; CHECK-NOT: call void @llvm.memset +; CHECK: call void @llvm.memcpy +; CHECK: ret void +} + +@half = internal constant half 0xH0000, align 4 +define void @test_half() nounwind { + %a = alloca half, align 4 + %i8 = bitcast half* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (half* @half to i8*), i64 2, i1 false) + ret void +; CHECK-LABEL: @test_half( +; CHECK: call void @llvm.memset +; CHECK-NOT: call void @llvm.memcpy +; CHECK: ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll new file mode 100644 index 00000000000..9cf4f91e6aa --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -basicaa -memcpyopt -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.foo = type { i8, [7 x i8], i32 } + +define i32 @test1(%struct.foo* nocapture %foobie) nounwind noinline ssp uwtable { + %bletch.sroa.1 = alloca [7 x i8], align 1 + %1 = getelementptr inbounds %struct.foo, %struct.foo* %foobie, i64 0, i32 0 + store i8 98, i8* %1, align 4 + %2 = getelementptr inbounds %struct.foo, %struct.foo* %foobie, i64 0, i32 1, i64 0 + %3 = getelementptr inbounds [7 x i8], [7 x i8]* %bletch.sroa.1, i64 0, i64 0 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 7, i1 false) + %4 = getelementptr inbounds %struct.foo, %struct.foo* %foobie, i64 0, i32 2 + store i32 20, i32* %4, align 4 + ret i32 undef + +; Check that the memcpy is removed. +; CHECK-LABEL: @test1( +; CHECK-NOT: call void @llvm.memcpy +} + +define void @test2(i8* sret noalias nocapture %out, i8* %in) nounwind noinline ssp uwtable { + call void @llvm.lifetime.start.p0i8(i64 8, i8* %in) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 8, i1 false) + ret void + +; Check that the memcpy is removed. 
+; CHECK-LABEL: @test2( +; CHECK-NOT: call void @llvm.memcpy +} + +define void @test3(i8* sret noalias nocapture %out, i8* %in) nounwind noinline ssp uwtable { + call void @llvm.lifetime.start.p0i8(i64 4, i8* %in) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 8, i1 false) + ret void + +; Check that the memcpy is not removed. +; CHECK-LABEL: @test3( +; CHECK: call void @llvm.memcpy +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll new file mode 100644 index 00000000000..4c5f6cbeb1a --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -0,0 +1,253 @@ +; RUN: opt < %s -basicaa -memcpyopt -dse -S | FileCheck -enable-var-scope %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target triple = "i686-apple-darwin9" + +%0 = type { x86_fp80, x86_fp80 } +%1 = type { i32, i32 } + +define void @test1(%0* sret %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind { +entry: + %tmp2 = alloca %0 + %memtmp = alloca %0, align 16 + %tmp5 = fsub x86_fp80 0xK80000000000000000000, %z.1 + call void @ccoshl(%0* sret %memtmp, x86_fp80 %tmp5, x86_fp80 %z.0) nounwind + %tmp219 = bitcast %0* %tmp2 to i8* + %memtmp20 = bitcast %0* %memtmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %tmp219, i8* align 16 %memtmp20, i32 32, i1 false) + %agg.result21 = bitcast %0* %agg.result to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %agg.result21, i8* align 16 %tmp219, i32 32, i1 false) + ret void + +; Check that one of the memcpy's is removed. +;; FIXME: PR 8643 We should be able to eliminate the last memcpy here. + +; CHECK-LABEL: @test1( +; CHECK: call void @ccoshl +; CHECK: call void @llvm.memcpy +; CHECK-NOT: llvm.memcpy +; CHECK: ret void +} + +declare void @ccoshl(%0* nocapture sret, x86_fp80, x86_fp80) nounwind + + +; The intermediate alloca and one of the memcpy's should be eliminated, the +; other should be replaced with a memmove. +define void @test2(i8* %P, i8* %Q) nounwind { + %memtmp = alloca %0, align 16 + %R = bitcast %0* %memtmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false) + ret void + +; CHECK-LABEL: @test2( +; CHECK-NEXT: call void @llvm.memmove{{.*}}(i8* align 16 %Q, i8* align 16 %P +; CHECK-NEXT: ret void +} + +; The intermediate alloca and one of the memcpy's should be eliminated, the +; other should be replaced with a memcpy.
+define void @test2_memcpy(i8* noalias %P, i8* noalias %Q) nounwind { + %memtmp = alloca %0, align 16 + %R = bitcast %0* %memtmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false) + ret void + +; CHECK-LABEL: @test2_memcpy( +; CHECK-NEXT: call void @llvm.memcpy{{.*}}(i8* align 16 %Q, i8* align 16 %P +; CHECK-NEXT: ret void +} + + + + +@x = external global %0 + +define void @test3(%0* noalias sret %agg.result) nounwind { + %x.0 = alloca %0 + %x.01 = bitcast %0* %x.0 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %x.01, i8* align 16 bitcast (%0* @x to i8*), i32 32, i1 false) + %agg.result2 = bitcast %0* %agg.result to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %agg.result2, i8* align 16 %x.01, i32 32, i1 false) + ret void +; CHECK-LABEL: @test3( +; CHECK-NEXT: %agg.result1 = bitcast +; CHECK-NEXT: call void @llvm.memcpy +; CHECK-NEXT: ret void +} + + +; PR8644 +define void @test4(i8 *%P) { + %A = alloca %1 + %a = bitcast %1* %A to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %a, i8* align 4 %P, i64 8, i1 false) + call void @test4a(i8* align 1 byval %a) + ret void +; CHECK-LABEL: @test4( +; CHECK-NEXT: call void @test4a( +} + +; Make sure we don't remove the memcpy if the source address space doesn't match the byval argument +define void @test4_addrspace(i8 addrspace(1)* %P) { + %A = alloca %1 + %a = bitcast %1* %A to i8* + call void @llvm.memcpy.p0i8.p1i8.i64(i8* align 4 %a, i8 addrspace(1)* align 4 %P, i64 8, i1 false) + call void @test4a(i8* align 1 byval %a) + ret void +; CHECK-LABEL: @test4_addrspace( +; CHECK: call void @llvm.memcpy.p0i8.p1i8.i64( +; CHECK-NEXT: call void @test4a( +} + +declare void @test4a(i8* align 1 byval) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind + +%struct.S = type { i128, [4 x i8]} + +@sS = external global %struct.S, align 16 + +declare void @test5a(%struct.S* align 16 byval) nounwind ssp + + +; rdar://8713376 - This memcpy can't be eliminated. +define i32 @test5(i32 %x) nounwind ssp { +entry: + %y = alloca %struct.S, align 16 + %tmp = bitcast %struct.S* %y to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %tmp, i8* align 16 bitcast (%struct.S* @sS to i8*), i64 32, i1 false) + %a = getelementptr %struct.S, %struct.S* %y, i64 0, i32 1, i64 0 + store i8 4, i8* %a + call void @test5a(%struct.S* align 16 byval %y) + ret i32 0 + ; CHECK-LABEL: @test5( + ; CHECK: store i8 4 + ; CHECK: call void @test5a(%struct.S* byval align 16 %y) +} + +;; Noop memcpy should be zapped. +define void @test6(i8 *%P) { + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %P, i8* align 4 %P, i64 8, i1 false) + ret void +; CHECK-LABEL: @test6( +; CHECK-NEXT: ret void +} + + +; PR9794 - Should forward memcpy into byval argument even though the memcpy +; isn't itself 8 byte aligned. 
+%struct.p = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+
+define i32 @test7(%struct.p* nocapture align 8 byval %q) nounwind ssp {
+entry:
+  %agg.tmp = alloca %struct.p, align 4
+  %tmp = bitcast %struct.p* %agg.tmp to i8*
+  %tmp1 = bitcast %struct.p* %q to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %tmp, i8* align 4 %tmp1, i64 48, i1 false)
+  %call = call i32 @g(%struct.p* align 8 byval %agg.tmp) nounwind
+  ret i32 %call
+; CHECK-LABEL: @test7(
+; CHECK: call i32 @g(%struct.p* byval align 8 %q) [[$NUW:#[0-9]+]]
+}
+
+declare i32 @g(%struct.p* align 8 byval)
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind
+
+; PR11142 - When looking for a memcpy-memcpy dependency, don't get stuck on
+; instructions between the memcpys that only affect the destination pointer.
+@test8.str = internal constant [7 x i8] c"ABCDEF\00"
+
+define void @test8() {
+; CHECK: test8
+; CHECK-NOT: memcpy
+  %A = tail call i8* @malloc(i32 10)
+  %B = getelementptr inbounds i8, i8* %A, i64 2
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %B, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @test8.str, i64 0, i64 0), i32 7, i1 false)
+  %C = tail call i8* @malloc(i32 10)
+  %D = getelementptr inbounds i8, i8* %C, i64 2
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %D, i8* %B, i32 7, i1 false)
+  ret void
+; CHECK: ret void
+}
+
+declare noalias i8* @malloc(i32)
+
+; rdar://11341081
+%struct.big = type { [50 x i32] }
+
+define void @test9_addrspacecast() nounwind ssp uwtable {
+entry:
+; CHECK-LABEL: @test9_addrspacecast(
+; CHECK: f1
+; CHECK-NOT: memcpy
+; CHECK: f2
+  %b = alloca %struct.big, align 4
+  %tmp = alloca %struct.big, align 4
+  call void @f1(%struct.big* sret %tmp)
+  %0 = addrspacecast %struct.big* %b to i8 addrspace(1)*
+  %1 = addrspacecast %struct.big* %tmp to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %0, i8 addrspace(1)* align 4 %1, i64 200, i1 false)
+  call void @f2(%struct.big* %b)
+  ret void
+}
+
+define void @test9() nounwind ssp uwtable {
+entry:
+; CHECK: test9
+; CHECK: f1
+; CHECK-NOT: memcpy
+; CHECK: f2
+  %b = alloca %struct.big, align 4
+  %tmp = alloca %struct.big, align 4
+  call void @f1(%struct.big* sret %tmp)
+  %0 = bitcast %struct.big* %b to i8*
+  %1 = bitcast %struct.big* %tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 200, i1 false)
+  call void @f2(%struct.big* %b)
+  ret void
+}
+
+; rdar://14073661
+; Test10 triggered an assertion when the compiler tried to get the size of the
+; opaque type of *x, where x is the formal argument with the 'sret' attribute.
+
+%opaque = type opaque
+declare void @foo(i32* noalias nocapture)
+
+define void @test10(%opaque* noalias nocapture sret %x, i32 %y) {
+  %a = alloca i32, align 4
+  store i32 %y, i32* %a
+  call void @foo(i32* noalias nocapture %a)
+  %c = load i32, i32* %a
+  %d = bitcast %opaque* %x to i32*
+  store i32 %c, i32* %d
+  ret void
+}
+
+; Don't create new addrspacecasts when we don't know they're safe for the target.
+define void @test11([20 x i32] addrspace(1)* nocapture dereferenceable(80) %P) {
+  %A = alloca [20 x i32], align 4
+  %a = bitcast [20 x i32]* %A to i8*
+  %b = bitcast [20 x i32] addrspace(1)* %P to i8 addrspace(1)*
+  call void @llvm.memset.p0i8.i64(i8* align 4 %a, i8 0, i64 80, i1 false)
+  call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* align 4 %b, i8* align 4 %a, i64 80, i1 false)
+  ret void
+; CHECK-LABEL: @test11(
+; CHECK-NOT: addrspacecast
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i1) nounwind
+
+declare void @f1(%struct.big* nocapture sret)
+declare void @f2(%struct.big*)
+
+; CHECK: attributes [[$NUW]] = { nounwind }
+; CHECK: attributes #1 = { argmemonly nounwind }
+; CHECK: attributes #2 = { nounwind ssp }
+; CHECK: attributes #3 = { nounwind ssp uwtable }
diff --git a/llvm/test/Transforms/MemCpyOpt/memmove.ll b/llvm/test/Transforms/MemCpyOpt/memmove.ll
new file mode 100644
index 00000000000..91f2851ab78
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/memmove.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -memcpyopt -S | FileCheck %s
+; These memmoves should get optimized to memcpys.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-apple-darwin9.0"
+
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+
+define i8* @test1(i8* nocapture %src) nounwind {
+entry:
+; CHECK-LABEL: @test1(
+; CHECK: call void @llvm.memcpy
+
+  %malloccall = tail call i8* @malloc(i32 trunc (i64 mul nuw (i64 ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64), i64 13) to i32))
+  %call3 = bitcast i8* %malloccall to [13 x i8]*
+  %call3.sub = getelementptr inbounds [13 x i8], [13 x i8]* %call3, i64 0, i64 0
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %call3.sub, i8* %src, i64 13, i1 false)
+  ret i8* %call3.sub
+}
+declare noalias i8* @malloc(i32)
+
+
+define void @test2(i8* %P) nounwind {
+entry:
+; CHECK-LABEL: @test2(
+; CHECK: call void @llvm.memcpy
+  %add.ptr = getelementptr i8, i8* %P, i64 16
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %add.ptr, i64 16, i1 false)
+  ret void
+}
+
+; This cannot be optimized because the src/dst really do overlap.
+define void @test3(i8* %P) nounwind { +entry: +; CHECK-LABEL: @test3( +; CHECK: call void @llvm.memmove + %add.ptr = getelementptr i8, i8* %P, i64 16 + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %add.ptr, i64 17, i1 false) + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll new file mode 100644 index 00000000000..7ee0682ed22 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -memcpyopt -S %s | FileCheck %s + +; memset -> memcpy forwarding, if memcpy is larger than memset, but trailing +; bytes are known to be undef. + + +%T = type { i64, i32, i32 } + +define void @test_alloca(i8* %result) { +; CHECK-LABEL: @test_alloca( +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 +; CHECK-NEXT: [[B:%.*]] = bitcast %T* [[A]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca %T, align 8 + %b = bitcast %T* %a to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false) + ret void +} + +define void @test_alloca_with_lifetimes(i8* %result) { +; CHECK-LABEL: @test_alloca_with_lifetimes( +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 +; CHECK-NEXT: [[B:%.*]] = bitcast %T* [[A]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[B]]) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* [[B]]) +; CHECK-NEXT: ret void +; + %a = alloca %T, align 8 + %b = bitcast %T* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 16, i8* %b) + call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false) + call void @llvm.lifetime.end.p0i8(i64 16, i8* %b) + ret void +} + +define void @test_malloc_with_lifetimes(i8* %result) { +; CHECK-LABEL: @test_malloc_with_lifetimes( +; CHECK-NEXT: [[A:%.*]] = call i8* @malloc(i64 16) +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[A]]) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[A]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* [[A]]) +; CHECK-NEXT: call void @free(i8* [[A]]) +; CHECK-NEXT: ret void +; + %a = call i8* @malloc(i64 16) + call void @llvm.lifetime.start.p0i8(i64 16, i8* %a) + call void @llvm.memset.p0i8.i64(i8* align 8 %a, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %a, i64 16, i1 false) + call void @llvm.lifetime.end.p0i8(i64 16, i8* %a) + call void @free(i8* %a) + ret void +} + +; memcpy size is larger than lifetime, don't optimize. 
+define void @test_copy_larger_than_lifetime_size(i8* %result) { +; CHECK-LABEL: @test_copy_larger_than_lifetime_size( +; CHECK-NEXT: [[A:%.*]] = call i8* @malloc(i64 16) +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 12, i8* [[A]]) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[A]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[A]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 12, i8* [[A]]) +; CHECK-NEXT: call void @free(i8* [[A]]) +; CHECK-NEXT: ret void +; + %a = call i8* @malloc(i64 16) + call void @llvm.lifetime.start.p0i8(i64 12, i8* %a) + call void @llvm.memset.p0i8.i64(i8* align 8 %a, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %a, i64 16, i1 false) + call void @llvm.lifetime.end.p0i8(i64 12, i8* %a) + call void @free(i8* %a) + ret void +} + +; The trailing bytes are not known to be undef, we can't ignore them. +define void @test_not_undef_memory(i8* %result, i8* %input) { +; CHECK-LABEL: @test_not_undef_memory( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[INPUT:%.*]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[INPUT]], i64 16, i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.memset.p0i8.i64(i8* align 8 %input, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %input, i64 16, i1 false) + ret void +} + +; Memset is volatile, memcpy is not. Can be optimized. +define void @test_volatile_memset(i8* %result) { +; CHECK-LABEL: @test_volatile_memset( +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 +; CHECK-NEXT: [[B:%.*]] = bitcast %T* [[A]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 true) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[RESULT:%.*]], i8 0, i64 12, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca %T, align 8 + %b = bitcast %T* %a to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 true) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false) + ret void +} + +; Memcpy is volatile, memset is not. Cannot be optimized. +define void @test_volatile_memcpy(i8* %result) { +; CHECK-LABEL: @test_volatile_memcpy( +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 +; CHECK-NEXT: [[B:%.*]] = bitcast %T* [[A]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 true) +; CHECK-NEXT: ret void +; + %a = alloca %T, align 8 + %b = bitcast %T* %a to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 true) + ret void +} + +; Write between memset and memcpy, can't optimize. 
+define void @test_write_between(i8* %result) { +; CHECK-LABEL: @test_write_between( +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 +; CHECK-NEXT: [[B:%.*]] = bitcast %T* [[A]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false) +; CHECK-NEXT: store i8 -1, i8* [[B]] +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca %T, align 8 + %b = bitcast %T* %a to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 12, i1 false) + store i8 -1, i8* %b + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false) + ret void +} + +; A write prior to the memset, which is part of the memset region. +; We could optimize this, but currently don't, because the used memory location is imprecise. +define void @test_write_before_memset_in_memset_region(i8* %result) { +; CHECK-LABEL: @test_write_before_memset_in_memset_region( +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 +; CHECK-NEXT: [[B:%.*]] = bitcast %T* [[A]] to i8* +; CHECK-NEXT: store i8 -1, i8* [[B]] +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca %T, align 8 + %b = bitcast %T* %a to i8* + store i8 -1, i8* %b + call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 8, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false) + ret void +} + +; A write prior to the memset, which is part of the memcpy (but not memset) region. +; This cannot be optimized. +define void @test_write_before_memset_in_memcpy_region(i8* %result) { +; CHECK-LABEL: @test_write_before_memset_in_memcpy_region( +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 +; CHECK-NEXT: [[B:%.*]] = bitcast %T* [[A]] to i8* +; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[T]], %T* [[A]], i64 0, i32 2 +; CHECK-NEXT: store i32 -1, i32* [[C]] +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca %T, align 8 + %b = bitcast %T* %a to i8* + %c = getelementptr inbounds %T, %T* %a, i64 0, i32 2 + store i32 -1, i32* %c + call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 8, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false) + ret void +} + +; A write prior to the memset, which is part of both the memset and memcpy regions. +; This cannot be optimized. 
+define void @test_write_before_memset_in_both_regions(i8* %result) { +; CHECK-LABEL: @test_write_before_memset_in_both_regions( +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 +; CHECK-NEXT: [[B:%.*]] = bitcast %T* [[A]] to i8* +; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[T]], %T* [[A]], i64 0, i32 1 +; CHECK-NEXT: store i32 -1, i32* [[C]] +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 10, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca %T, align 8 + %b = bitcast %T* %a to i8* + %c = getelementptr inbounds %T, %T* %a, i64 0, i32 1 + store i32 -1, i32* %c + call void @llvm.memset.p0i8.i64(i8* align 8 %b, i8 0, i64 10, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %result, i8* align 8 %b, i64 16, i1 false) + ret void +} + +declare i8* @malloc(i64) +declare void @free(i8*) + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll new file mode 100644 index 00000000000..a3ca96c3ab5 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll @@ -0,0 +1,168 @@ +; RUN: opt -basicaa -memcpyopt -S %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: define void @test +; CHECK: [[ULE:%[0-9]+]] = icmp ule i64 %dst_size, %src_size +; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i64 %dst_size, %src_size +; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i64 0, i64 [[SIZEDIFF]] +; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i64 %src_size +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 %c, i64 [[SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false) +; CHECK-NEXT: ret void +define void @test(i8* %src, i64 %src_size, i8* %dst, i64 %dst_size, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* %dst, i8 %c, i64 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_different_types_i32_i64 +; CHECK: [[DSTSIZE:%[0-9]+]] = zext i32 %dst_size to i64 +; CHECK: [[ULE:%[0-9]+]] = icmp ule i64 [[DSTSIZE]], %src_size +; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i64 [[DSTSIZE]], %src_size +; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i64 0, i64 [[SIZEDIFF]] +; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i64 %src_size +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 %c, i64 [[SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false) +; CHECK-NEXT: ret void +define void @test_different_types_i32_i64(i8* %dst, i8* %src, i32 %dst_size, i64 %src_size, i8 %c) { + call void @llvm.memset.p0i8.i32(i8* %dst, i8 %c, i32 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_different_types_i128_i32 +; CHECK: [[SRCSIZE:%[0-9]+]] = zext i32 %src_size to i128 +; CHECK: [[ULE:%[0-9]+]] = icmp ule i128 %dst_size, [[SRCSIZE]] +; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i128 %dst_size, [[SRCSIZE]] +; CHECK: 
[[SIZE:%[0-9]+]] = select i1 [[ULE]], i128 0, i128 [[SIZEDIFF]] +; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i128 [[SRCSIZE]] +; CHECK-NEXT: call void @llvm.memset.p0i8.i128(i8* align 1 [[DST]], i8 %c, i128 [[SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %src_size, i1 false) +; CHECK-NEXT: ret void +define void @test_different_types_i128_i32(i8* %dst, i8* %src, i128 %dst_size, i32 %src_size, i8 %c) { + call void @llvm.memset.p0i8.i128(i8* %dst, i8 %c, i128 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %src_size, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_different_types_i32_i128 +; CHECK: [[DSTSIZE:%[0-9]+]] = zext i32 %dst_size to i128 +; CHECK: [[ULE:%[0-9]+]] = icmp ule i128 [[DSTSIZE]], %src_size +; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i128 [[DSTSIZE]], %src_size +; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i128 0, i128 [[SIZEDIFF]] +; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i128 %src_size +; CHECK-NEXT: call void @llvm.memset.p0i8.i128(i8* align 1 [[DST]], i8 %c, i128 [[SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i128(i8* %dst, i8* %src, i128 %src_size, i1 false) +; CHECK-NEXT: ret void +define void @test_different_types_i32_i128(i8* %dst, i8* %src, i32 %dst_size, i128 %src_size, i8 %c) { + call void @llvm.memset.p0i8.i32(i8* %dst, i8 %c, i32 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i128(i8* %dst, i8* %src, i128 %src_size, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_different_types_i64_i32 +; CHECK: [[SRCSIZE:%[0-9]+]] = zext i32 %src_size to i64 +; CHECK: [[ULE:%[0-9]+]] = icmp ule i64 %dst_size, [[SRCSIZE]] +; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i64 %dst_size, [[SRCSIZE]] +; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i64 0, i64 [[SIZEDIFF]] +; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i64 [[SRCSIZE]] +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 %c, i64 [[SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %src_size, i1 false) +; CHECK-NEXT: ret void +define void @test_different_types_i64_i32(i8* %dst, i8* %src, i64 %dst_size, i32 %src_size, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* %dst, i8 %c, i64 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %src_size, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_align_same +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 {{.*}}, i8 0, i64 {{.*}}, i1 false) +define void @test_align_same(i8* %src, i8* %dst, i64 %dst_size) { + call void @llvm.memset.p0i8.i64(i8* align 8 %dst, i8 0, i64 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 80, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_align_min +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 {{.*}}, i8 0, i64 {{.*}}, i1 false) +define void @test_align_min(i8* %src, i8* %dst, i64 %dst_size) { + call void @llvm.memset.p0i8.i64(i8* align 8 %dst, i8 0, i64 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 36, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_align_memcpy +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 {{.*}}, i8 0, i64 {{.*}}, i1 false) +define void @test_align_memcpy(i8* %src, i8* %dst, i64 %dst_size) { + call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %src, i64 80, i1 false) 
+ ret void +} + +; CHECK-LABEL: define void @test_non_i8_dst_type +; CHECK-NEXT: %dst = bitcast i64* %dst_pi64 to i8* +; CHECK: [[ULE:%[0-9]+]] = icmp ule i64 %dst_size, %src_size +; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i64 %dst_size, %src_size +; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i64 0, i64 [[SIZEDIFF]] +; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i64 %src_size +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 %c, i64 [[SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false) +; CHECK-NEXT: ret void +define void @test_non_i8_dst_type(i8* %src, i64 %src_size, i64* %dst_pi64, i64 %dst_size, i8 %c) { + %dst = bitcast i64* %dst_pi64 to i8* + call void @llvm.memset.p0i8.i64(i8* %dst, i8 %c, i64 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_different_dst +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %src, i64 %src_size, i1 false) +; CHECK-NEXT: ret void +define void @test_different_dst(i8* %dst2, i8* %src, i64 %src_size, i8* %dst, i64 %dst_size) { + call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %src, i64 %src_size, i1 false) + ret void +} + +; Make sure we also take into account dependencies on the destination. + +; CHECK-LABEL: define i8 @test_intermediate_read +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i1 false) +; CHECK-NEXT: %r = load i8, i8* %a +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 24, i1 false) +; CHECK-NEXT: ret i8 %r +define i8 @test_intermediate_read(i8* %a, i8* %b) #0 { + call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i1 false) + %r = load i8, i8* %a + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 24, i1 false) + ret i8 %r +} + +%struct = type { [8 x i8], [8 x i8] } + +; CHECK-LABEL: define void @test_intermediate_write +; CHECK-NEXT: %a = alloca %struct +; CHECK-NEXT: %a0 = getelementptr %struct, %struct* %a, i32 0, i32 0, i32 0 +; CHECK-NEXT: %a1 = getelementptr %struct, %struct* %a, i32 0, i32 1, i32 0 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %a0, i8 0, i64 16, i1 false) +; CHECK-NEXT: store i8 1, i8* %a1 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a0, i8* %b, i64 8, i1 false) +; CHECK-NEXT: ret void +define void @test_intermediate_write(i8* %b) #0 { + %a = alloca %struct + %a0 = getelementptr %struct, %struct* %a, i32 0, i32 0, i32 0 + %a1 = getelementptr %struct, %struct* %a, i32 0, i32 1, i32 0 + call void @llvm.memset.p0i8.i64(i8* %a0, i8 0, i64 16, i1 false) + store i8 1, i8* %a1 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a0, i8* %b, i64 8, i1 false) + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1) +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) +declare void @llvm.memset.p0i8.i128(i8* nocapture, i8, i128, i1) +declare void @llvm.memcpy.p0i8.p0i8.i128(i8* nocapture, i8* nocapture readonly, i128, i1) diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll new file mode 100644 index 
00000000000..e36389a128f --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll @@ -0,0 +1,101 @@ +; RUN: opt -memcpyopt -S %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: define void @test( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %dst2, i8 %c, i64 128, i1 false) +; CHECK-NEXT: ret void +define void @test(i8* %dst1, i8* %dst2, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_smaller_memcpy( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 100, i1 false) +; CHECK-NEXT: ret void +define void @test_smaller_memcpy(i8* %dst1, i8* %dst2, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 100, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_smaller_memset( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 100, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false) +; CHECK-NEXT: ret void +define void @test_smaller_memset(i8* %dst1, i8* %dst2, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 100, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_align_memset( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 128, i1 false) +; CHECK-NEXT: ret void +define void @test_align_memset(i8* %dst1, i8* %dst2, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_different_types( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %dst2, i8 %c, i32 100, i1 false) +; CHECK-NEXT: ret void +define void @test_different_types(i8* %dst1, i8* %dst2, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst2, i8* %dst1, i32 100, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_different_types_2( +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 %dst1, i8 %c, i32 128, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 100, i1 false) +; CHECK-NEXT: ret void +define void @test_different_types_2(i8* %dst1, i8* %dst2, i8 %c) { + call void @llvm.memset.p0i8.i32(i8* align 8 %dst1, i8 %c, i32 128, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 100, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_different_source_gep( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false) +; CHECK-NEXT: %p = getelementptr i8, i8* %dst1, i64 64 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %p, i64 64, i1 false) +; CHECK-NEXT: ret void +define void @test_different_source_gep(i8* %dst1, i8* %dst2, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* 
%dst1, i8 %c, i64 128, i1 false) + ; FIXME: We could optimize this as well. + %p = getelementptr i8, i8* %dst1, i64 64 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %p, i64 64, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_variable_size_1( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false) +; CHECK-NEXT: ret void +define void @test_variable_size_1(i8* %dst1, i64 %dst1_size, i8* %dst2, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false) + ret void +} + +; CHECK-LABEL: define void @test_variable_size_2( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 %dst2_size, i1 false) +; CHECK-NEXT: ret void +define void @test_variable_size_2(i8* %dst1, i8* %dst2, i64 %dst2_size, i8 %c) { + call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 %dst2_size, i1 false) + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1) +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) diff --git a/llvm/test/Transforms/MemCpyOpt/nontemporal.ll b/llvm/test/Transforms/MemCpyOpt/nontemporal.ll new file mode 100644 index 00000000000..d9dafcc7b81 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/nontemporal.ll @@ -0,0 +1,49 @@ +; RUN: opt < %s -memcpyopt -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Verify that we don't combine nontemporal stores into memset calls. 
+
+define void @nontemporal_stores_1(<4 x float>* nocapture %dst) {
+; CHECK-LABEL: @nontemporal_stores_1
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !0
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr1, align 16, !nontemporal !0
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr2, align 16, !nontemporal !0
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr3, align 16, !nontemporal !0
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr4, align 16, !nontemporal !0
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr5, align 16, !nontemporal !0
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr6, align 16, !nontemporal !0
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr7, align 16, !nontemporal !0
+; CHECK-NEXT: ret void
+entry:
+  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !0
+  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 1
+  store <4 x float> zeroinitializer, <4 x float>* %ptr1, align 16, !nontemporal !0
+  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 2
+  store <4 x float> zeroinitializer, <4 x float>* %ptr2, align 16, !nontemporal !0
+  %ptr3 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 3
+  store <4 x float> zeroinitializer, <4 x float>* %ptr3, align 16, !nontemporal !0
+  %ptr4 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 4
+  store <4 x float> zeroinitializer, <4 x float>* %ptr4, align 16, !nontemporal !0
+  %ptr5 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 5
+  store <4 x float> zeroinitializer, <4 x float>* %ptr5, align 16, !nontemporal !0
+  %ptr6 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 6
+  store <4 x float> zeroinitializer, <4 x float>* %ptr6, align 16, !nontemporal !0
+  %ptr7 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 7
+  store <4 x float> zeroinitializer, <4 x float>* %ptr7, align 16, !nontemporal !0
+  ret void
+}
+
+define void @nontemporal_stores_2(<4 x float>* nocapture %dst) {
+; CHECK-LABEL: @nontemporal_stores_2
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !0
+; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr1, align 16, !nontemporal !0
+; CHECK-NEXT: ret void
+entry:
+  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !0
+  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 1
+  store <4 x float> zeroinitializer, <4 x float>* %ptr1, align 16, !nontemporal !0
+  ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/Transforms/MemCpyOpt/pr29105.ll b/llvm/test/Transforms/MemCpyOpt/pr29105.ll
new file mode 100644
index 00000000000..e9e9b611aef
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/pr29105.ll
@@ -0,0 +1,39 @@
+; RUN: opt -memcpyopt -instcombine -S %s | FileCheck %s
+%Foo = type { [2048 x i64] }
+
+; Make sure that all memcpy calls are converted to memset calls, or removed.
+; CHECK-LABEL: @baz( +; CHECK-NOT: call void @llvm.memcpy +define void @baz() unnamed_addr #0 { +entry-block: + %x.sroa.0 = alloca [2048 x i64], align 8 + %tmp0 = alloca [2048 x i64], align 8 + %0 = bitcast [2048 x i64]* %tmp0 to i8* + %tmp2 = alloca %Foo, align 8 + %x.sroa.0.0..sroa_cast6 = bitcast [2048 x i64]* %x.sroa.0 to i8* + call void @llvm.lifetime.start.p0i8(i64 16384, i8* %x.sroa.0.0..sroa_cast6) + call void @llvm.lifetime.start.p0i8(i64 16384, i8* %0) + call void @llvm.memset.p0i8.i64(i8* align 8 %0, i8 0, i64 16384, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %x.sroa.0.0..sroa_cast6, i8* align 8 %0, i64 16384, i1 false) + call void @llvm.lifetime.end.p0i8(i64 16384, i8* %0) + %1 = bitcast %Foo* %tmp2 to i8* + call void @llvm.lifetime.start.p0i8(i64 16384, i8* %1) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 %x.sroa.0.0..sroa_cast6, i64 16384, i1 false) + call void @bar(%Foo* noalias nocapture nonnull dereferenceable(16384) %tmp2) + call void @llvm.lifetime.end.p0i8(i64 16384, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 16384, i8* %x.sroa.0.0..sroa_cast6) + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1 + +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +declare void @bar(%Foo* noalias nocapture readonly dereferenceable(16384)) unnamed_addr #0 + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #1 + +attributes #0 = { uwtable } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/MemCpyOpt/process_store.ll b/llvm/test/Transforms/MemCpyOpt/process_store.ll new file mode 100644 index 00000000000..e2edef0a94f --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/process_store.ll @@ -0,0 +1,39 @@ +; RUN: opt < %s -memcpyopt -disable-output + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@b = common dso_local local_unnamed_addr global i32 0, align 4 +@a = common dso_local local_unnamed_addr global i32 0, align 4 + +declare dso_local i32 @f1() + +; Do not crash due to store first in BB. +define dso_local void @f2() { +for.end: + %0 = load i32, i32* @b, align 4 + ret void + +for.body: + store i32 %1, i32* @a, align 4 + %call = call i32 @f1() + %cmp = icmp sge i32 %call, 0 + %1 = load i32, i32* @b, align 4 + br label %for.body +} + +; Do not crash due to call not before store in BB. 
+define dso_local void @f3() { +for.end: + %0 = load i32, i32* @b, align 4 + ret void + +for.body: + %t = add i32 %t2, 1 + store i32 %1, i32* @a, align 4 + %call = call i32 @f1() + %cmp = icmp sge i32 %call, 0 + %1 = load i32, i32* @b, align 4 + %t2 = xor i32 %t, 5 + br label %for.body +} diff --git a/llvm/test/Transforms/MemCpyOpt/profitable-memset.ll b/llvm/test/Transforms/MemCpyOpt/profitable-memset.ll new file mode 100644 index 00000000000..649d2386f96 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/profitable-memset.ll @@ -0,0 +1,20 @@ +; RUN: opt < %s -memcpyopt -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: @foo( +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64(i8* align 2 %2, i8 0, i64 8, i1 false) + +define void @foo(i64* nocapture %P) { +entry: + %0 = bitcast i64* %P to i16* + %arrayidx = getelementptr inbounds i16, i16* %0, i64 1 + %1 = bitcast i16* %arrayidx to i32* + %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3 + store i16 0, i16* %0, align 2 + store i32 0, i32* %1, align 4 + store i16 0, i16* %arrayidx1, align 2 + ret void +} + diff --git a/llvm/test/Transforms/MemCpyOpt/smaller.ll b/llvm/test/Transforms/MemCpyOpt/smaller.ll new file mode 100644 index 00000000000..0c82b5201dc --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/smaller.ll @@ -0,0 +1,29 @@ +; RUN: opt -memcpyopt -S < %s | FileCheck %s +; RUN: opt -passes=memcpyopt -S < %s | FileCheck %s +; rdar://8875553 + +; Memcpyopt shouldn't optimize the second memcpy using the first +; because the first has a smaller size. + +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %tmp, i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @cell, i32 0, i32 0, i32 0), i32 16, i1 false) + +target datalayout = "e-p:32:32:32" + +%struct.s = type { [11 x i8], i32 } + +@.str = private constant [11 x i8] c"0123456789\00" +@cell = external global %struct.s + +declare void @check(%struct.s* byval %p) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind + +define void @foo() nounwind { +entry: + %agg.tmp = alloca %struct.s, align 4 + store i32 99, i32* getelementptr inbounds (%struct.s, %struct.s* @cell, i32 0, i32 1), align 4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 getelementptr inbounds (%struct.s, %struct.s* @cell, i32 0, i32 0, i32 0), i8* align 1 getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 11, i1 false) + %tmp = getelementptr inbounds %struct.s, %struct.s* %agg.tmp, i32 0, i32 0, i32 0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %tmp, i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @cell, i32 0, i32 0, i32 0), i32 16, i1 false) + call void @check(%struct.s* byval %agg.tmp) + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/sret.ll b/llvm/test/Transforms/MemCpyOpt/sret.ll new file mode 100644 index 00000000000..a99b52d5d6e --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/sret.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -basicaa -memcpyopt -S | not grep "call.*memcpy" + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target triple = "i686-apple-darwin9" + +%0 = type { x86_fp80, x86_fp80 } + +define void @ccosl(%0* noalias sret %agg.result, %0* byval align 8 %z) nounwind { +entry: + %iz = alloca %0 + %memtmp = alloca %0, align 16 + %tmp1 = getelementptr %0, %0* %z, i32 0, i32 1 + %tmp2 = load x86_fp80, x86_fp80* %tmp1, align 16 + %tmp3 = fsub x86_fp80 
0xK80000000000000000000, %tmp2 + %tmp4 = getelementptr %0, %0* %iz, i32 0, i32 1 + %real = getelementptr %0, %0* %iz, i32 0, i32 0 + %tmp7 = getelementptr %0, %0* %z, i32 0, i32 0 + %tmp8 = load x86_fp80, x86_fp80* %tmp7, align 16 + store x86_fp80 %tmp3, x86_fp80* %real, align 16 + store x86_fp80 %tmp8, x86_fp80* %tmp4, align 16 + call void @ccoshl(%0* noalias sret %memtmp, %0* byval align 8 %iz) nounwind + %memtmp14 = bitcast %0* %memtmp to i8* + %agg.result15 = bitcast %0* %agg.result to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %agg.result15, i8* align 16 %memtmp14, i32 32, i1 false) + ret void +} + +declare void @ccoshl(%0* noalias nocapture sret, %0* byval) nounwind + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind diff --git a/llvm/test/Transforms/MemCpyOpt/stackrestore.ll b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll new file mode 100644 index 00000000000..4bead3381cc --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll @@ -0,0 +1,74 @@ +; RUN: opt -S -memcpyopt < %s | FileCheck %s + +; PR40118: BasicAA didn't realize that stackrestore ends the lifetime of +; unescaped dynamic allocas, such as those that might come from inalloca. + +source_filename = "t.cpp" +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-unknown-windows-msvc19.14.26433" + +@str = internal constant [9 x i8] c"abcdxxxxx" + + +; Test that we can propagate memcpy through an unescaped dynamic alloca across +; a call to @external. + +define i32 @test_norestore(i32 %n) { + %tmpmem = alloca [10 x i8], align 4 + %tmp = getelementptr inbounds [10 x i8], [10 x i8]* %tmpmem, i32 0, i32 0 + + ; Make a dynamic alloca, initialize it. + %p = alloca i8, i32 %n, align 4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false) + + ; This extra byte exists to prevent memcpyopt from propagating @str. + %p10 = getelementptr inbounds i8, i8* %p, i32 9 + store i8 0, i8* %p10 + + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* %p, i32 10, i1 false) + call void @external() + %heap = call i8* @malloc(i32 9) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %heap, i8* %tmp, i32 9, i1 false) + call void @useit(i8* %heap) + ret i32 0 +} + +; CHECK-LABEL: define i32 @test_norestore(i32 %n) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* %p, i32 10, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %heap, i8* %p, i32 9, i1 false) + + +; Do not propagate memcpy from %p across the stackrestore. + +define i32 @test_stackrestore() { + %tmpmem = alloca [10 x i8], align 4 + %tmp = getelementptr inbounds [10 x i8], [10 x i8]* %tmpmem, i32 0, i32 0 + %inalloca.save = tail call i8* @llvm.stacksave() + %argmem = alloca inalloca [10 x i8], align 4 + %p = getelementptr inbounds [10 x i8], [10 x i8]* %argmem, i32 0, i32 0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false) + + ; This extra byte exists to prevent memcpyopt from propagating @str. 
+ %p10 = getelementptr inbounds [10 x i8], [10 x i8]* %argmem, i32 0, i32 9 + store i8 0, i8* %p10 + + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* %p, i32 10, i1 false) + call void @llvm.stackrestore(i8* %inalloca.save) + %heap = call i8* @malloc(i32 9) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %heap, i8* %tmp, i32 9, i1 false) + call void @useit(i8* %heap) + ret i32 0 +} + +; CHECK-LABEL: define i32 @test_stackrestore() +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* %p, i32 10, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %heap, i8* %tmp, i32 9, i1 false) + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1) +declare i8* @llvm.stacksave() +declare void @llvm.stackrestore(i8*) +declare i8* @malloc(i32) +declare void @useit(i8*) +declare void @external() |