summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAmara Emerson <aemerson@apple.com>2019-08-05 20:02:52 +0000
committerAmara Emerson <aemerson@apple.com>2019-08-05 20:02:52 +0000
commit85e5e28ab4c826593610e25aac7197a35da8244c (patch)
treecc4baa8bdb74698ed790b524f840aa99c5652df0
parent6e33c647f3077d91079bf4c33d03acda47a55a1c (diff)
downloadbcm5719-llvm-85e5e28ab4c826593610e25aac7197a35da8244c.tar.gz
bcm5719-llvm-85e5e28ab4c826593610e25aac7197a35da8244c.zip
[AArch64][GlobalISel] Inline tiny memcpy et al at -O0.
FastISel already does this since the initial arm64 port was upstreamed, so it seems there are no issues with doing this at -O0 for very small memcpys. Gives a 0.2% geomean code size improvement on CTMark. Differential Revision: https://reviews.llvm.org/D65758 llvm-svn: 367919
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h4
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp7
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir86
4 files changed, 97 insertions, 5 deletions
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 8399e1b453b..ee1582ff33c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -66,8 +66,8 @@ public:
bool tryCombineBr(MachineInstr &MI);
/// Optimize memcpy intrinsics et al, e.g. constant len calls.
- ///
- bool tryCombineMemCpyFamily(MachineInstr &MI);
+ /// \p MaxLen, if non-zero, specifies the max length of a mem libcall to inline.
+ bool tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen = 0);
/// Try to transform \p MI by using all of the above
/// combine functions. Returns true if changed.
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 373d52fd1d4..91498133faf 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -861,7 +861,7 @@ bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
return true;
}
-bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI) {
+bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
// This combine is fairly complex so it's not written with a separate
// matcher function.
assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
@@ -900,6 +900,9 @@ bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI) {
return true;
}
+ if (MaxLen && KnownLen > MaxLen)
+ return false;
+
if (ID == Intrinsic::memcpy)
return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
if (ID == Intrinsic::memmove)
diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
index 5ec209ada17..835fcf09f0a 100644
--- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
@@ -56,9 +56,12 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset: {
+ // At -O0, set a MaxLen of 32 so only tiny calls are inlined; otherwise let
+ // the other heuristics decide.
+ unsigned MaxLen = EnableOpt ? 0 : 32;
// Try to inline memcpy type calls unless optimizing for size.
- return (EnableOpt && !EnableOptSize) ? Helper.tryCombineMemCpyFamily(MI)
- : false;
+ return (!EnableOptSize) ? Helper.tryCombineMemCpyFamily(MI, MaxLen)
+ : false;
}
default:
break;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir
new file mode 100644
index 00000000000..2eff8b8be8b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir
@@ -0,0 +1,86 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=aarch64-prelegalizer-combiner -O0 -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+ target triple = "arm64-apple-darwin"
+
+ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1 immarg) #1
+
+ define void @test_small_memcpy(i32* nocapture %dst, i32* nocapture readonly %src) {
+ entry:
+ %0 = bitcast i32* %dst to i8*
+ %1 = bitcast i32* %src to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 32, i1 false)
+ ret void
+ }
+
+ define void @test_large_memcpy(i32* nocapture %dst, i32* nocapture readonly %src) {
+ entry:
+ %0 = bitcast i32* %dst to i8*
+ %1 = bitcast i32* %src to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 36, i1 false)
+ ret void
+ }
+
+ attributes #1 = { argmemonly nounwind }
+
+...
+---
+name: test_small_memcpy
+alignment: 2
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+machineFunctionInfo: {}
+body: |
+ bb.1.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: test_small_memcpy
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.1, align 4)
+ ; CHECK: G_STORE [[LOAD]](s128), [[COPY]](p0) :: (store 16 into %ir.0, align 4)
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C]](s64)
+ ; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p0) :: (load 16 from %ir.1 + 16, align 4)
+ ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+ ; CHECK: G_STORE [[LOAD1]](s128), [[GEP1]](p0) :: (store 16 into %ir.0 + 16, align 4)
+ ; CHECK: RET_ReallyLR
+ %0:_(p0) = COPY $x0
+ %1:_(p0) = COPY $x1
+ %2:_(s64) = G_CONSTANT i64 32
+ G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4)
+ RET_ReallyLR
+
+...
+---
+name: test_large_memcpy
+alignment: 2
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+machineFunctionInfo: {}
+body: |
+ bb.1.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: test_large_memcpy
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
+ ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[COPY]](p0), [[COPY1]](p0), [[C]](s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4)
+ ; CHECK: RET_ReallyLR
+ %0:_(p0) = COPY $x0
+ %1:_(p0) = COPY $x1
+ %2:_(s64) = G_CONSTANT i64 36
+ G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4)
+ RET_ReallyLR
+
+...
OpenPOWER on IntegriCloud