| field | value | date |
|---|---|---|
| author | Tim Northover <tnorthover@apple.com> | 2014-03-29 10:18:08 +0000 |
| committer | Tim Northover <tnorthover@apple.com> | 2014-03-29 10:18:08 +0000 |
| commit | 00ed9964c65962e2afc8e3c83a2f7114b0ce25a0 (patch) | |
| tree | 21404a5e99549c2f98d72eac05f9df0b484d2d28 /llvm/test/Transforms | |
| parent | 3e38d290c872ae8d875b8dbe2d55262cee3a3cf9 (diff) | |
| download | bcm5719-llvm-00ed9964c65962e2afc8e3c83a2f7114b0ce25a0.tar.gz, bcm5719-llvm-00ed9964c65962e2afc8e3c83a2f7114b0ce25a0.zip | |
ARM64: initial backend import
This adds a second implementation of the AArch64 architecture to LLVM,
accessible in parallel via the "arm64" triple. The plan over the
coming weeks & months is to merge the two into a single backend,
during which time thorough code review should naturally occur.
Everything will be easier with the target in-tree though, hence this
commit.
llvm-svn: 205090
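For orientation, the new target is selected purely by triple, exactly as the tests in this patch do. Below is a minimal sketch of driving the new backend through llc; the function itself is illustrative and not part of this commit:

```llvm
; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s

; Trivial input routed through the new ARM64 backend rather than the
; existing AArch64 one, chosen solely by the "arm64" triple.
define i32 @add(i32 %a, i32 %b) nounwind {
  %sum = add i32 %a, %b
  ret i32 %sum
}
```

With an `aarch64-*` triple the same IR is still compiled by the existing AArch64 backend, which is how the two implementations coexist while they are merged.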
Diffstat (limited to 'llvm/test/Transforms')
11 files changed, 478 insertions, 3 deletions
diff --git a/llvm/test/Transforms/GlobalMerge/ARM/arm.ll b/llvm/test/Transforms/GlobalMerge/ARM/arm.ll
new file mode 100644
index 00000000000..8c77de62ece
--- /dev/null
+++ b/llvm/test/Transforms/GlobalMerge/ARM/arm.ll
@@ -0,0 +1,85 @@
+; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -global-merge=true | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s
+; RUN: llc %s -O1 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - -global-merge=true | FileCheck -check-prefix=MERGE %s
+
+; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
+; MERGE: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
+
+; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; NO-MERGE: .zerofill __DATA,__bss,_bar,20,2
+; NO-MERGE: .zerofill __DATA,__bss,_baz,20,2
+; NO-MERGE: .zerofill __DATA,__bss,_foo,20,2
+; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+@bar = internal global [5 x i32] zeroinitializer, align 4
+@baz = internal global [5 x i32] zeroinitializer, align 4
+@foo = internal global [5 x i32] zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp
+define internal void @initialize() #0 {
+  %1 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %1, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 0), align 4, !tbaa !1
+  %2 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %2, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 0), align 4, !tbaa !1
+  %3 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %3, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 1), align 4, !tbaa !1
+  %4 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %4, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 1), align 4, !tbaa !1
+  %5 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %5, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 2), align 4, !tbaa !1
+  %6 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %6, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 2), align 4, !tbaa !1
+  %7 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %7, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 3), align 4, !tbaa !1
+  %8 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %8, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 3), align 4, !tbaa !1
+  %9 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %9, i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 4), align 4, !tbaa !1
+  %10 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #3
+  store i32 %10, i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 4), align 4, !tbaa !1
+  ret void
+}
+
+declare i32 @calc(...) #1
+
+; Function Attrs: nounwind ssp
+define internal void @calculate() #0 {
+  %1 = load <4 x i32>* bitcast ([5 x i32]* @bar to <4 x i32>*), align 4
+  %2 = load <4 x i32>* bitcast ([5 x i32]* @baz to <4 x i32>*), align 4
+  %3 = mul <4 x i32> %2, %1
+  store <4 x i32> %3, <4 x i32>* bitcast ([5 x i32]* @foo to <4 x i32>*), align 4
+  %4 = load i32* getelementptr inbounds ([5 x i32]* @bar, i32 0, i32 4), align 4, !tbaa !1
+  %5 = load i32* getelementptr inbounds ([5 x i32]* @baz, i32 0, i32 4), align 4, !tbaa !1
+  %6 = mul nsw i32 %5, %4
+  store i32 %6, i32* getelementptr inbounds ([5 x i32]* @foo, i32 0, i32 4), align 4, !tbaa !1
+  ret void
+}
+
+; Function Attrs: nounwind readnone ssp
+define internal i32* @returnFoo() #2 {
+  ret i32* getelementptr inbounds ([5 x i32]* @foo, i32 0, i32 0)
+}
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"LLVM version 3.4 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/llvm/test/Transforms/GlobalMerge/ARM/lit.local.cfg b/llvm/test/Transforms/GlobalMerge/ARM/lit.local.cfg
new file mode 100644
index 00000000000..8a3ba96497e
--- /dev/null
+++ b/llvm/test/Transforms/GlobalMerge/ARM/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())
+if not 'ARM' in targets:
+    config.unsupported = True
+
diff --git a/llvm/test/Transforms/GlobalMerge/ARM64/arm64.ll b/llvm/test/Transforms/GlobalMerge/ARM64/arm64.ll
new file mode 100644
index 00000000000..eea474a74f1
--- /dev/null
+++ b/llvm/test/Transforms/GlobalMerge/ARM64/arm64.ll
@@ -0,0 +1,88 @@
+; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -global-merge=true | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s
+; RUN: llc %s -O1 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - -global-merge=true | FileCheck -check-prefix=MERGE %s
+
+; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
+; MERGE: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
+; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
+
+; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; NO-MERGE: .zerofill __DATA,__bss,_bar,20,2
+; NO-MERGE: .zerofill __DATA,__bss,_baz,20,2
+; NO-MERGE: .zerofill __DATA,__bss,_foo,20,2
+; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+@bar = internal global [5 x i32] zeroinitializer, align 4
+@baz = internal global [5 x i32] zeroinitializer, align 4
+@foo = internal global [5 x i32] zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp
+define internal void @initialize() #0 {
+  %1 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %1, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 0), align 4
+  %2 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %2, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 0), align 4
+  %3 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %3, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 1), align 4
+  %4 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %4, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 1), align 4
+  %5 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %5, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 2), align 4
+  %6 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %6, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 2), align 4
+  %7 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %7, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 3), align 4
+  %8 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %8, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 3), align 4
+  %9 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %9, i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 4), align 4
+  %10 = tail call i32 bitcast (i32 (...)* @calc to i32 ()*)() #2
+  store i32 %10, i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 4), align 4
+  ret void
+}
+
+declare i32 @calc(...)
+
+; Function Attrs: nounwind ssp
+define internal void @calculate() #0 {
+  %1 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 0), align 4
+  %2 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 0), align 4
+  %3 = mul nsw i32 %2, %1
+  store i32 %3, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 0), align 4
+  %4 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 1), align 4
+  %5 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 1), align 4
+  %6 = mul nsw i32 %5, %4
+  store i32 %6, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 1), align 4
+  %7 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 2), align 4
+  %8 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 2), align 4
+  %9 = mul nsw i32 %8, %7
+  store i32 %9, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 2), align 4
+  %10 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 3), align 4
+  %11 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 3), align 4
+  %12 = mul nsw i32 %11, %10
+  store i32 %12, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 3), align 4
+  %13 = load i32* getelementptr inbounds ([5 x i32]* @bar, i64 0, i64 4), align 4
+  %14 = load i32* getelementptr inbounds ([5 x i32]* @baz, i64 0, i64 4), align 4
+  %15 = mul nsw i32 %14, %13
+  store i32 %15, i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 4), align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone ssp
+define internal i32* @returnFoo() #1 {
+  ret i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 0)
+}
+
+attributes #0 = { nounwind ssp }
+attributes #1 = { nounwind readnone ssp }
+attributes #2 = { nounwind }
diff --git a/llvm/test/Transforms/GlobalMerge/ARM64/lit.local.cfg b/llvm/test/Transforms/GlobalMerge/ARM64/lit.local.cfg
new file mode 100644
index 00000000000..a75a42b6f74
--- /dev/null
+++ b/llvm/test/Transforms/GlobalMerge/ARM64/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())
+if not 'ARM64' in targets:
+    config.unsupported = True
+
diff --git a/llvm/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll b/llvm/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll
index 2dedd44e2be..1883a8fc8e6 100644
--- a/llvm/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll
@@ -1,6 +1,3 @@
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-target triple = "thumbv7-apple-ios0"
-
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
 define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
@@ -67,6 +64,72 @@ entry:
 declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
 declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
 
+; ARM64 variants - <rdar://problem/12349617>
+
+define <4 x i32> @mulByZeroARM64(<4 x i16> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
+  ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> zeroinitializer
+}
+
+define <4 x i32> @mulByOneARM64(<4 x i16> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+  ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @constantMulARM64() nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+  ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+}
+
+define <4 x i32> @constantMulSARM64() nounwind readnone ssp {
+entry:
+  %b = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+}
+
+define <4 x i32> @constantMulUARM64() nounwind readnone ssp {
+entry:
+  %b = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+}
+
+define <4 x i32> @complex1ARM64(<4 x i16> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
+  %b = add <4 x i32> zeroinitializer, %a
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]]
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @complex2ARM64(<4 x i32> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+  %b = add <4 x i32> %x, %a
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT: ret <4 x i32> %b
+}
+
+declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+
 ; CHECK: attributes #0 = { nounwind readnone ssp }
 ; CHECK: attributes #1 = { nounwind readnone }
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/llvm/test/Transforms/InstCombine/sincospi.ll b/llvm/test/Transforms/InstCombine/sincospi.ll
index c810ae475a4..739827f1962 100644
--- a/llvm/test/Transforms/InstCombine/sincospi.ll
+++ b/llvm/test/Transforms/InstCombine/sincospi.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -instcombine -S < %s -mtriple=x86_64-apple-macosx10.9 | FileCheck %s --check-prefix=CHECK-FLOAT-IN-VEC
 ; RUN: opt -instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s
+; RUN: opt -instcombine -S < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
 ; RUN: opt -instcombine -S < %s -mtriple=x86_64-apple-macosx10.8 | FileCheck %s --check-prefix=CHECK-NO-SINCOS
 ; RUN: opt -instcombine -S < %s -mtriple=arm-apple-ios6.0 | FileCheck %s --check-prefix=CHECK-NO-SINCOS
 ; RUN: opt -instcombine -S < %s -mtriple=x86_64-none-linux-gnu | FileCheck %s --check-prefix=CHECK-NO-SINCOS
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM64/lit.local.cfg b/llvm/test/Transforms/LoopStrengthReduce/ARM64/lit.local.cfg
new file mode 100644
index 00000000000..a49957999f0
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM64/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+targets = set(config.root.targets_to_build.split())
+if not 'ARM64' in targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM64/lsr-memcpy.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM64/lsr-memcpy.ll
new file mode 100644
index 00000000000..16f6afa6f52
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM64/lsr-memcpy.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=arm64 -mcpu=cyclone -pre-RA-sched=list-hybrid < %s | FileCheck %s
+; rdar://10232252
+; Prevent LSR of doing poor choice that cannot be folded in addressing mode
+
+; Remove the -pre-RA-sched=list-hybrid option after fixing:
+; <rdar://problem/12702735> [ARM64][coalescer] need better register
+; coalescing for simple unit tests.
+
+; CHECK: testCase
+; CHECK: %while.body{{$}}
+; CHECK: ldr [[STREG:x[0-9]+]], [{{x[0-9]+}}], #8
+; CHECK-NEXT: str [[STREG]], [{{x[0-9]+}}], #8
+; CHECK: %while.end
+define i32 @testCase() nounwind ssp {
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %entry
+  %len.06 = phi i64 [ 1288, %entry ], [ %sub, %while.body ]
+  %pDst.05 = phi i64* [ inttoptr (i64 6442450944 to i64*), %entry ], [ %incdec.ptr1, %while.body ]
+  %pSrc.04 = phi i64* [ inttoptr (i64 4294967296 to i64*), %entry ], [ %incdec.ptr, %while.body ]
+  %incdec.ptr = getelementptr inbounds i64* %pSrc.04, i64 1
+  %tmp = load volatile i64* %pSrc.04, align 8
+  %incdec.ptr1 = getelementptr inbounds i64* %pDst.05, i64 1
+  store volatile i64 %tmp, i64* %pDst.05, align 8
+  %sub = add i64 %len.06, -8
+  %cmp = icmp sgt i64 %sub, -1
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body
+  tail call void inttoptr (i64 6442450944 to void ()*)() nounwind
+  ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM64/lsr-memset.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM64/lsr-memset.ll
new file mode 100644
index 00000000000..19208025a45
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM64/lsr-memset.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -O3 -march=arm64 -mcpu=cyclone -pre-RA-sched=list-hybrid | FileCheck %s
+; <rdar://problem/11635990> [arm64] [lsr] Inefficient EA/loop-exit calc in bzero_phys
+;
+; LSR on loop %while.cond should reassociate non-address mode
+; expressions at use %cmp16 to avoid sinking computation into %while.body18.
+;
+; Remove the -pre-RA-sched=list-hybrid option after fixing:
+; <rdar://problem/12702735> [ARM64][coalescer] need better register
+; coalescing for simple unit tests.
+
+; CHECK: @memset
+; CHECK: %while.body18{{$}}
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #8
+; First set the IVREG variable, then use it
+; CHECK-NEXT: sub [[IVREG:x[0-9]+]],
+; CHECK: [[IVREG]], #8
+; CHECK-NEXT: cmp  [[IVREG]], #7
+; CHECK-NEXT: b.hi
+define i8* @memset(i8* %dest, i32 %val, i64 %len) nounwind ssp noimplicitfloat {
+entry:
+  %cmp = icmp eq i64 %len, 0
+  br i1 %cmp, label %done, label %while.cond.preheader
+
+while.cond.preheader:                             ; preds = %entry
+  %conv = trunc i32 %val to i8
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %while.cond.preheader
+  %ptr.0 = phi i8* [ %incdec.ptr, %while.body ], [ %dest, %while.cond.preheader ]
+  %len.addr.0 = phi i64 [ %dec, %while.body ], [ %len, %while.cond.preheader ]
+  %cond = icmp eq i64 %len.addr.0, 0
+  br i1 %cond, label %done, label %land.rhs
+
+land.rhs:                                         ; preds = %while.cond
+  %0 = ptrtoint i8* %ptr.0 to i64
+  %and = and i64 %0, 7
+  %cmp5 = icmp eq i64 %and, 0
+  br i1 %cmp5, label %if.end9, label %while.body
+
+while.body:                                       ; preds = %land.rhs
+  %incdec.ptr = getelementptr inbounds i8* %ptr.0, i64 1
+  store i8 %conv, i8* %ptr.0, align 1, !tbaa !0
+  %dec = add i64 %len.addr.0, -1
+  br label %while.cond
+
+if.end9:                                          ; preds = %land.rhs
+  %conv.mask = and i32 %val, 255
+  %1 = zext i32 %conv.mask to i64
+  %2 = shl nuw nsw i64 %1, 8
+  %ins18 = or i64 %2, %1
+  %3 = shl nuw nsw i64 %1, 16
+  %ins15 = or i64 %ins18, %3
+  %4 = shl nuw nsw i64 %1, 24
+  %5 = shl nuw nsw i64 %1, 32
+  %mask8 = or i64 %ins15, %4
+  %6 = shl nuw nsw i64 %1, 40
+  %mask5 = or i64 %mask8, %5
+  %7 = shl nuw nsw i64 %1, 48
+  %8 = shl nuw i64 %1, 56
+  %mask2.masked = or i64 %mask5, %6
+  %mask = or i64 %mask2.masked, %7
+  %ins = or i64 %mask, %8
+  %9 = bitcast i8* %ptr.0 to i64*
+  %cmp1636 = icmp ugt i64 %len.addr.0, 7
+  br i1 %cmp1636, label %while.body18, label %while.body29.lr.ph
+
+while.body18:                                     ; preds = %if.end9, %while.body18
+  %wideptr.038 = phi i64* [ %incdec.ptr19, %while.body18 ], [ %9, %if.end9 ]
+  %len.addr.137 = phi i64 [ %sub, %while.body18 ], [ %len.addr.0, %if.end9 ]
+  %incdec.ptr19 = getelementptr inbounds i64* %wideptr.038, i64 1
+  store i64 %ins, i64* %wideptr.038, align 8, !tbaa !2
+  %sub = add i64 %len.addr.137, -8
+  %cmp16 = icmp ugt i64 %sub, 7
+  br i1 %cmp16, label %while.body18, label %while.end20
+
+while.end20:                                      ; preds = %while.body18
+  %cmp21 = icmp eq i64 %sub, 0
+  br i1 %cmp21, label %done, label %while.body29.lr.ph
+
+while.body29.lr.ph:                               ; preds = %while.end20, %if.end9
+  %len.addr.1.lcssa49 = phi i64 [ %sub, %while.end20 ], [ %len.addr.0, %if.end9 ]
+  %wideptr.0.lcssa48 = phi i64* [ %incdec.ptr19, %while.end20 ], [ %9, %if.end9 ]
+  %10 = bitcast i64* %wideptr.0.lcssa48 to i8*
+  br label %while.body29
+
+while.body29:                                     ; preds = %while.body29, %while.body29.lr.ph
+  %len.addr.235 = phi i64 [ %len.addr.1.lcssa49, %while.body29.lr.ph ], [ %dec26, %while.body29 ]
+  %ptr.134 = phi i8* [ %10, %while.body29.lr.ph ], [ %incdec.ptr31, %while.body29 ]
+  %dec26 = add i64 %len.addr.235, -1
+  %incdec.ptr31 = getelementptr inbounds i8* %ptr.134, i64 1
+  store i8 %conv, i8* %ptr.134, align 1, !tbaa !0
+  %cmp27 = icmp eq i64 %dec26, 0
+  br i1 %cmp27, label %done, label %while.body29
+
+done:                                             ; preds = %while.cond, %while.body29, %while.end20, %entry
+  ret i8* %dest
+}
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA"}
+!2 = metadata !{metadata !"long long", metadata !0}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM64/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM64/gather-cost.ll
new file mode 100644
index 00000000000..bb285382e53
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM64/gather-cost.ll
@@ -0,0 +1,85 @@
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+
+@kernel = global [512 x float] zeroinitializer, align 16
+@kernel2 = global [512 x float] zeroinitializer, align 16
+@kernel3 = global [512 x float] zeroinitializer, align 16
+@kernel4 = global [512 x float] zeroinitializer, align 16
+@src_data = global [1536 x float] zeroinitializer, align 16
+@r_ = global i8 0, align 1
+@g_ = global i8 0, align 1
+@b_ = global i8 0, align 1
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive.
+; Make sure we don't vectorize it.
+; CHECK-NOT: x float>
+
+define void @_Z4testmm(i64 %size, i64 %offset) {
+entry:
+  %cmp53 = icmp eq i64 %size, 0
+  br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+  %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+  %add = add i64 %v.055, %offset
+  %mul = mul i64 %add, 3
+  %arrayidx = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %mul
+  %0 = load float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float]* @kernel, i64 0, i64 %v.055
+  %1 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %1
+  %arrayidx4 = getelementptr inbounds [512 x float]* @kernel2, i64 0, i64 %v.055
+  %2 = load float* %arrayidx4, align 4
+  %mul5 = fmul fast float %mul3, %2
+  %arrayidx6 = getelementptr inbounds [512 x float]* @kernel3, i64 0, i64 %v.055
+  %3 = load float* %arrayidx6, align 4
+  %mul7 = fmul fast float %mul5, %3
+  %arrayidx8 = getelementptr inbounds [512 x float]* @kernel4, i64 0, i64 %v.055
+  %4 = load float* %arrayidx8, align 4
+  %mul9 = fmul fast float %mul7, %4
+  %add10 = fadd fast float %r.057, %mul9
+  %arrayidx.sum = add i64 %mul, 1
+  %arrayidx11 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
+  %5 = load float* %arrayidx11, align 4
+  %mul13 = fmul fast float %1, %5
+  %mul15 = fmul fast float %2, %mul13
+  %mul17 = fmul fast float %3, %mul15
+  %mul19 = fmul fast float %4, %mul17
+  %add20 = fadd fast float %g.056, %mul19
+  %arrayidx.sum52 = add i64 %mul, 2
+  %arrayidx21 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
+  %6 = load float* %arrayidx21, align 4
+  %mul23 = fmul fast float %1, %6
+  %mul25 = fmul fast float %2, %mul23
+  %mul27 = fmul fast float %3, %mul25
+  %mul29 = fmul fast float %4, %mul27
+  %add30 = fadd fast float %b.054, %mul29
+  %inc = add i64 %v.055, 1
+  %exitcond = icmp ne i64 %inc, %size
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+  %add30.lcssa = phi float [ %add30, %for.body ]
+  %add20.lcssa = phi float [ %add20, %for.body ]
+  %add10.lcssa = phi float [ %add10, %for.body ]
+  %phitmp = fptoui float %add10.lcssa to i8
+  %phitmp60 = fptoui float %add20.lcssa to i8
+  %phitmp61 = fptoui float %add30.lcssa to i8
+  br label %for.end
+
+for.end:
+  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  store i8 %r.0.lcssa, i8* @r_, align 1
+  store i8 %g.0.lcssa, i8* @g_, align 1
+  store i8 %b.0.lcssa, i8* @b_, align 1
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM64/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/ARM64/lit.local.cfg
new file mode 100644
index 00000000000..de86e548526
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM64/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'ARM64' in targets:
+    config.unsupported = True
+

