diff options
author | JF Bastien <jfbastien@apple.com> | 2018-09-06 16:03:32 +0000 |
---|---|---|
committer | JF Bastien <jfbastien@apple.com> | 2018-09-06 16:03:32 +0000 |
commit | 29200611055f49a0d37243caa5f8bba1df9d57a6 (patch) | |
tree | ce84c79f58ff029187b4eb2eea291d65f71742a0 /llvm/test/CodeGen/AArch64/arm64-memset-inline.ll | |
parent | 99d732052ff2282e3c7b3e5c7f29c217ff4f7316 (diff) | |
download | bcm5719-llvm-29200611055f49a0d37243caa5f8bba1df9d57a6.tar.gz bcm5719-llvm-29200611055f49a0d37243caa5f8bba1df9d57a6.zip |
ARM64: improve non-zero memset isel by ~2x
Summary:
I added a few ARM64 memset codegen tests in r341406 and r341493, and annotated
where the generated code was bad. This patch fixes the majority of the issues by
requesting that a 2xi64 vector be used for memset of 32 bytes and above.
The patch leaves the former request for f128 unchanged, despite f128
materialization being suboptimal: doing otherwise runs into other asserts in
isel and makes this patch too broad.
This patch hides the issue that was present in bzero_40_stack and bzero_72_stack
because the code now generates in a better order which doesn't have the store
offset issue. I'm not aware of that issue appearing elsewhere at the moment.
<rdar://problem/44157755>
Reviewers: t.p.northover, MatzeB, javed.absar
Subscribers: eraman, kristof.beyls, chrib, dexonsmith, llvm-commits
Differential Revision: https://reviews.llvm.org/D51706
llvm-svn: 341558
Diffstat (limited to 'llvm/test/CodeGen/AArch64/arm64-memset-inline.ll')
-rw-r--r-- | llvm/test/CodeGen/AArch64/arm64-memset-inline.ll | 104 |
1 file changed, 37 insertions, 67 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll index b22e3c0c0b0..8946d8db331 100644 --- a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll +++ b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll @@ -137,14 +137,12 @@ define void @bzero_32_stack() { ret void } -; FIXME These don't pair up because the offset isn't a multiple of 16 bits. x0, however, could be used as a base for a paired store. define void @bzero_40_stack() { ; CHECK-LABEL: bzero_40_stack: -; CHECK: stp xzr, x30, [sp, #40] -; CHECK: movi v0.2d, #0000000000000000 -; CHECK-NEXT: add x0, sp, #8 -; CHECK-NEXT: stur q0, [sp, #24] -; CHECK-NEXT: stur q0, [sp, #8] +; CHECK: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: str xzr, [sp, #32] +; CHECK-NEXT: stp q0, q0, [sp] ; CHECK-NEXT: bl something %buf = alloca [40 x i8], align 1 %cast = bitcast [40 x i8]* %buf to i8* @@ -167,16 +165,13 @@ define void @bzero_64_stack() { ret void } -; FIXME These don't pair up because the offset isn't a multiple of 16 bits. x0, however, could be used as a base for a paired store. define void @bzero_72_stack() { ; CHECK-LABEL: bzero_72_stack: -; CHECK: stp xzr, x30, [sp, #72] ; CHECK: movi v0.2d, #0000000000000000 -; CHECK-NEXT: x0, sp, #8 -; CHECK-NEXT: stur q0, [sp, #56] -; CHECK-NEXT: stur q0, [sp, #40] -; CHECK-NEXT: stur q0, [sp, #24] -; CHECK-NEXT: stur q0, [sp, #8] +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: str xzr, [sp, #64] +; CHECK-NEXT: stp q0, q0, [sp, #32] +; CHECK-NEXT: stp q0, q0, [sp] ; CHECK-NEXT: bl something %buf = alloca [72 x i8], align 1 %cast = bitcast [72 x i8]* %buf to i8* @@ -310,14 +305,11 @@ define void @memset_26_stack() { ret void } -; FIXME This could use FP ops. 
define void @memset_32_stack() { ; CHECK-LABEL: memset_32_stack: -; CHECK: mov x8, #-6148914691236517206 +; CHECK: movi v0.16b, #170 ; CHECK-NEXT: mov x0, sp -; CHECK-NEXT: stp x8, x30, [sp, #24] -; CHECK-NEXT: stp x8, x8, [sp, #8] -; CHECK-NEXT: str x8, [sp] +; CHECK-NEXT: stp q0, q0, [sp] ; CHECK-NEXT: bl something %buf = alloca [32 x i8], align 1 %cast = bitcast [32 x i8]* %buf to i8* @@ -326,14 +318,13 @@ define void @memset_32_stack() { ret void } -; FIXME This could use FP ops. define void @memset_40_stack() { ; CHECK-LABEL: memset_40_stack: ; CHECK: mov x8, #-6148914691236517206 -; CHECK-NEXT: add x0, sp, #8 -; CHECK-NEXT: stp x8, x30, [sp, #40] -; CHECK-NEXT: stp x8, x8, [sp, #24] -; CHECK-NEXT: stp x8, x8, [sp, #8] +; CHECK-NEXT: movi v0.16b, #170 +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: str x8, [sp, #32] +; CHECK-NEXT: stp q0, q0, [sp] ; CHECK-NEXT: bl something %buf = alloca [40 x i8], align 1 %cast = bitcast [40 x i8]* %buf to i8* @@ -342,16 +333,12 @@ define void @memset_40_stack() { ret void } -; FIXME This could use FP ops. define void @memset_64_stack() { ; CHECK-LABEL: memset_64_stack: -; CHECK: mov x8, #-6148914691236517206 +; CHECK: movi v0.16b, #170 ; CHECK-NEXT: mov x0, sp -; CHECK-NEXT: stp x8, x30, [sp, #56] -; CHECK-NEXT: stp x8, x8, [sp, #40] -; CHECK-NEXT: stp x8, x8, [sp, #24] -; CHECK-NEXT: stp x8, x8, [sp, #8] -; CHECK-NEXT: str x8, [sp] +; CHECK-NEXT: stp q0, q0, [sp, #32] +; CHECK-NEXT: stp q0, q0, [sp] ; CHECK-NEXT: bl something %buf = alloca [64 x i8], align 1 %cast = bitcast [64 x i8]* %buf to i8* @@ -360,16 +347,14 @@ define void @memset_64_stack() { ret void } -; FIXME This could use FP ops. 
define void @memset_72_stack() { ; CHECK-LABEL: memset_72_stack: ; CHECK: mov x8, #-6148914691236517206 -; CHECK-NEXT: add x0, sp, #8 -; CHECK-NEXT: stp x8, x30, [sp, #72] -; CHECK-NEXT: stp x8, x8, [sp, #56] -; CHECK-NEXT: stp x8, x8, [sp, #40] -; CHECK-NEXT: stp x8, x8, [sp, #24] -; CHECK-NEXT: stp x8, x8, [sp, #8] +; CHECK-NEXT: movi v0.16b, #170 +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: str x8, [sp, #64] +; CHECK-NEXT: stp q0, q0, [sp, #32] +; CHECK-NEXT: stp q0, q0, [sp] ; CHECK-NEXT: bl something %buf = alloca [72 x i8], align 1 %cast = bitcast [72 x i8]* %buf to i8* @@ -378,20 +363,14 @@ define void @memset_72_stack() { ret void } -; FIXME This could use FP ops. define void @memset_128_stack() { ; CHECK-LABEL: memset_128_stack: -; CHECK: mov x8, #-6148914691236517206 +; CHECK: movi v0.16b, #170 ; CHECK-NEXT: mov x0, sp -; CHECK-NEXT: stp x8, x30, [sp, #120] -; CHECK-NEXT: stp x8, x8, [sp, #104] -; CHECK-NEXT: stp x8, x8, [sp, #88] -; CHECK-NEXT: stp x8, x8, [sp, #72] -; CHECK-NEXT: stp x8, x8, [sp, #56] -; CHECK-NEXT: stp x8, x8, [sp, #40] -; CHECK-NEXT: stp x8, x8, [sp, #24] -; CHECK-NEXT: stp x8, x8, [sp, #8] -; CHECK-NEXT: str x8, [sp] +; CHECK-NEXT: stp q0, q0, [sp, #96] +; CHECK-NEXT: stp q0, q0, [sp, #64] +; CHECK-NEXT: stp q0, q0, [sp, #32] +; CHECK-NEXT: stp q0, q0, [sp] ; CHECK-NEXT: bl something %buf = alloca [128 x i8], align 1 %cast = bitcast [128 x i8]* %buf to i8* @@ -400,27 +379,18 @@ define void @memset_128_stack() { ret void } -; FIXME This could use FP ops. 
define void @memset_256_stack() { ; CHECK-LABEL: memset_256_stack: -; CHECK: mov x8, #-6148914691236517206 -; CHECK-NEXT: mov x0, sp -; CHECK-NEXT: stp x8, x8, [sp, #240] -; CHECK-NEXT: stp x8, x8, [sp, #224] -; CHECK-NEXT: stp x8, x8, [sp, #208] -; CHECK-NEXT: stp x8, x8, [sp, #192] -; CHECK-NEXT: stp x8, x8, [sp, #176] -; CHECK-NEXT: stp x8, x8, [sp, #160] -; CHECK-NEXT: stp x8, x8, [sp, #144] -; CHECK-NEXT: stp x8, x8, [sp, #128] -; CHECK-NEXT: stp x8, x8, [sp, #112] -; CHECK-NEXT: stp x8, x8, [sp, #96] -; CHECK-NEXT: stp x8, x8, [sp, #80] -; CHECK-NEXT: stp x8, x8, [sp, #64] -; CHECK-NEXT: stp x8, x8, [sp, #48] -; CHECK-NEXT: stp x8, x8, [sp, #32] -; CHECK-NEXT: stp x8, x8, [sp, #16] -; CHECK-NEXT: stp x8, x8, [sp] +; CHECK: movi v0.16b, #170 +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: stp q0, q0, [sp, #224] +; CHECK-NEXT: stp q0, q0, [sp, #192] +; CHECK-NEXT: stp q0, q0, [sp, #160] +; CHECK-NEXT: stp q0, q0, [sp, #128] +; CHECK-NEXT: stp q0, q0, [sp, #96] +; CHECK-NEXT: stp q0, q0, [sp, #64] +; CHECK-NEXT: stp q0, q0, [sp, #32] +; CHECK-NEXT: stp q0, q0, [sp] ; CHECK-NEXT: bl something %buf = alloca [256 x i8], align 1 %cast = bitcast [256 x i8]* %buf to i8* |