ARM: Introduce conservative load/store optimization mode

Most of the time ARM has the CCR.UNALIGN_TRP bit set to false, which means that unaligned loads/stores do not trap and even extensive testing will not catch these alignment bugs. However, the multi/double variants are not affected by this bit and will still trap. In effect, a more aggressive load/store optimization will break existing (bad) code. These bugs do not necessarily manifest in the broken code where the misaligned pointer is formed, but often later in perfectly legal code where it is accessed. This means recompiling system libraries (which have no alignment bugs) with a newer compiler will break existing applications (with alignment bugs) that worked before. So (under protest) I implemented this safe mode, which limits the formation of multi/double operations to cases that are not affected by user code (stack operations like spills/reloads) or cases where the normal operations trap anyway (floating point load/stores). It is disabled by default. Differential Revision: http://reviews.llvm.org/D17015 llvm-svn: 262504
author: Matthias Braun <matze@braunis.de> 2016-03-02 19:20:00 +0000
committer: Matthias Braun <matze@braunis.de> 2016-03-02 19:20:00 +0000
commit: f290912d2248687e688779d1d89999df56c14a09 (patch)
tree: 3fcf0e6854ae16ba357b814c0a0a16f1a88f313c
parent: 578787ad3055d0a874bb76b148a9d5dc7bd18db5 (diff)
download: bcm5719-llvm-f290912d2248687e688779d1d89999df56c14a09.tar.gz
bcm5719-llvm-f290912d2248687e688779d1d89999df56c14a09.zip
4 files changed, 111 insertions, 31 deletions
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 5ee6641720e..cc49f9d549b 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -60,6 +60,15 @@ STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
 STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
 STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
 
+/// This switch disables formation of double/multi instructions that could
+/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
+/// disabled. This can be used to create libraries that are robust even when
+/// users provoke undefined behaviour by supplying misaligned pointers.
+/// \see mayCombineMisaligned()
+static cl::opt<bool>
+AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
+    cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
+
 namespace llvm {
 void initializeARMLoadStoreOptPass(PassRegistry &);
 }
@@ -916,6 +925,24 @@ static bool isValidLSDoubleOffset(int Offset) {
   return (Value % 4) == 0 && Value < 1024;
 }
 
+/// Return true for loads/stores that can be combined into a double/multi
+/// operation without increasing the alignment requirements of the access.
+static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
+                                 const MachineInstr &MI) {
+  // vldr/vstr trap on misaligned pointers anyway, so merging them into a
+  // multi operation makes no difference; only i32 loads/stores are at risk.
+  unsigned Opcode = MI.getOpcode();
+  if (!isi32Load(Opcode) && !isi32Store(Opcode))
+    return true;
+
+  // Stack pointer alignment is out of the programmer's control, so we can
+  // trust SP-relative accesses when the transient stack alignment is >= 4.
+  if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
+      STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+    return true;
+  return false;
+}
+
 /// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
 void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
   const MachineInstr *FirstMI = MemOps[0].MI;
@@ -954,6 +981,10 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
     if (PReg == ARM::SP || PReg == ARM::PC)
       CanMergeToLSMulti = CanMergeToLSDouble = false;
 
+    // Should we be conservative?
+    if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+
     // Merge following instructions where possible.
     for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
       int NewOffset = MemOps[I].Offset;
@@ -1926,6 +1957,9 @@ INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt",
                 ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
 
 bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (AssumeMisalignedLoadStores)
+    return false;
+
   TD = &Fn.getDataLayout();
   STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
   TII = STI->getInstrInfo();
diff --git a/llvm/test/CodeGen/ARM/ldrd.ll b/llvm/test/CodeGen/ARM/ldrd.ll
index b2596346bfa..dd97fbfd640 100644
--- a/llvm/test/CodeGen/ARM/ldrd.ll
+++ b/llvm/test/CodeGen/ARM/ldrd.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK -check-prefix=NORMAL
 ; rdar://6949835
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK -check-prefix=NORMAL
+
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-assume-misaligned-load-store | FileCheck %s -check-prefix=CHECK -check-prefix=CONSERVATIVE
 
 ; Magic ARM pair hints works best with linearscan / fast.
 
@@ -15,12 +17,13 @@ declare void @use_i64(i64 %v)
 
 define void @test_ldrd(i64 %a) nounwind readonly {
 ; CHECK-LABEL: test_ldrd:
-; CHECK: bl{{x?}} _get_ptr
+; NORMAL: bl{{x?}} _get_ptr
 ; A8: ldrd r0, r1, [r0]
 ; Cortex-M3 errata 602117: LDRD with base in list may result in incorrect base
 ; register when interrupted or faulted.
 ; M3-NOT: ldrd r[[REGNUM:[0-9]+]], {{r[0-9]+}}, [r[[REGNUM]]]
-; CHECK: bl{{x?}} _use_i64
+; CONSERVATIVE-NOT: ldrd
+; NORMAL: bl{{x?}} _use_i64
   %ptr = call i64* @get_ptr()
   %v = load i64, i64* %ptr, align 8
   call void @use_i64(i64 %v)
@@ -39,11 +42,10 @@ define void @test_ldrd(i64 %a) nounwind readonly {
 ; evict another live range or use callee saved regs. Sorry if the test
 ; is sensitive to Regalloc changes, but it is an interesting case.
 ;
-; BASIC: @f
+; CHECK-LABEL: f:
 ; BASIC: %bb
 ; BASIC: ldrd
 ; BASIC: str
-; GREEDY: @f
 ; GREEDY: %bb
 ; GREEDY: ldrd
 ; GREEDY: str
@@ -76,14 +78,15 @@ return:                                           ; preds = %bb, %entry
 
 @TestVar = external global %struct.Test
 
+; CHECK-LABEL: Func1:
 define void @Func1() nounwind ssp {
-; CHECK: @Func1
 entry: 
 ; A8: movw [[BASE:r[0-9]+]], :lower16:{{.*}}TestVar{{.*}}
 ; A8: movt [[BASE]], :upper16:{{.*}}TestVar{{.*}}
 ; A8: ldrd [[FIELD1:r[0-9]+]], [[FIELD2:r[0-9]+]], {{\[}}[[BASE]], #4]
 ; A8-NEXT: add [[FIELD1]], [[FIELD2]]
 ; A8-NEXT: str [[FIELD1]], {{\[}}[[BASE]]{{\]}}
+; CONSERVATIVE-NOT: ldrd
   %orig_blocks = alloca [256 x i16], align 2
   %0 = bitcast [256 x i16]* %orig_blocks to i8*call void @llvm.lifetime.start(i64 512, i8* %0) nounwind
   %tmp1 = load i32, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 1), align 4
@@ -97,8 +100,9 @@ entry:
 declare void @extfunc(i32, i32, i32, i32)
 
 ; CHECK-LABEL: Func2:
+; CONSERVATIVE-NOT: ldrd
 ; A8: ldrd
-; A8: blx
+; CHECK: bl{{x?}} _extfunc
 ; A8: pop
 define void @Func2(i32* %p) {
 entry:
@@ -116,12 +120,14 @@ entry:
 ; M3: strd r1, r0, [sp, #-8]!
 ; BASIC: strd r1, r0, [sp, #-8]!
 ; GREEDY: strd r0, r1, [sp, #-8]!
-; CHECK: @ InlineAsm Start
-; CHECK: @ InlineAsm End
+; CONSERVATIVE: strd r0, r1, [sp, #-8]!
+; NORMAL: @ InlineAsm Start
+; NORMAL: @ InlineAsm End
 ; A8: ldrd r2, r1, [sp]
 ; M3: ldrd r2, r1, [sp]
 ; BASIC: ldrd r2, r1, [sp]
 ; GREEDY: ldrd r1, r2, [sp]
+; CONSERVATIVE: ldrd r1, r2, [sp]
 ; CHECK: bl{{x?}} _extfunc
 define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
   ; force %v0 and %v1 to be spilled
@@ -134,8 +140,9 @@ define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
 declare void @extfunc2(i32*, i32, i32)
 
 ; CHECK-LABEL: ldrd_postupdate_dec:
-; CHECK: ldrd r1, r2, [r0], #-8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; NORMAL: ldrd r1, r2, [r0], #-8
+; CONSERVATIVE-NOT: ldrd
+; CHECK: bl{{x?}} _extfunc
 define void @ldrd_postupdate_dec(i32* %p0) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   %v0 = load i32, i32* %p0
@@ -146,8 +153,9 @@ define void @ldrd_postupdate_dec(i32* %p0) {
 }
 
 ; CHECK-LABEL: ldrd_postupdate_inc:
-; CHECK: ldrd r1, r2, [r0], #8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; NORMAL: ldrd r1, r2, [r0], #8
+; CONSERVATIVE-NOT: ldrd
+; CHECK: bl{{x?}} _extfunc
 define void @ldrd_postupdate_inc(i32* %p0) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   %v0 = load i32, i32* %p0
@@ -158,8 +166,9 @@ define void @ldrd_postupdate_inc(i32* %p0) {
 }
 
 ; CHECK-LABEL: strd_postupdate_dec:
-; CHECK: strd r1, r2, [r0], #-8
-; CHECK-NEXT: bx lr
+; NORMAL: strd r1, r2, [r0], #-8
+; CONSERVATIVE-NOT: strd
+; CHECK: bx lr
 define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   store i32 %v0, i32* %p0
@@ -169,8 +178,9 @@ define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
 }
 
 ; CHECK-LABEL: strd_postupdate_inc:
-; CHECK: strd r1, r2, [r0], #8
-; CHECK-NEXT: bx lr
+; NORMAL: strd r1, r2, [r0], #8
+; CONSERVATIVE-NOT: strd
+; CHECK: bx lr
 define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   store i32 %v0, i32* %p0
diff --git a/llvm/test/CodeGen/ARM/swift-vldm.ll b/llvm/test/CodeGen/ARM/swift-vldm.ll
index 9e507279fa0..a53b2413bde 100644
--- a/llvm/test/CodeGen/ARM/swift-vldm.ll
+++ b/llvm/test/CodeGen/ARM/swift-vldm.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
+; RUN: llc < %s -arm-assume-misaligned-load-store -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
 
 ; Check that we avoid producing vldm instructions using d registers that
 ; begin in the most-significant half of a q register. These require more
diff --git a/llvm/test/CodeGen/Thumb2/thumb2-ldm.ll b/llvm/test/CodeGen/Thumb2/thumb2-ldm.ll
index 28903aca326..a5b47411124 100644
--- a/llvm/test/CodeGen/Thumb2/thumb2-ldm.ll
+++ b/llvm/test/CodeGen/Thumb2/thumb2-ldm.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s -check-prefix=ALL -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 -arm-assume-misaligned-load-store | FileCheck %s -check-prefix=ALL -check-prefix=CONSERVATIVE
 
 @X = external global [0 x i32]          ; <[0 x i32]*> [#uses=5]
 
 define i32 @t1() {
-; CHECK-LABEL: t1:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t1:
+; ALL: push {r7, lr}
 ; CHECK: ldrd
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
         %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 0)            ; <i32> [#uses=1]
         %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1)           ; <i32> [#uses=1]
         %tmp4 = call i32 @f1( i32 %tmp, i32 %tmp3 )                ; <i32> [#uses=1]
@@ -14,10 +17,12 @@ define i32 @t1() {
 }
 
 define i32 @t2() {
-; CHECK-LABEL: t2:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t2:
+; ALL: push {r7, lr}
 ; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
         %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2)            ; <i32> [#uses=1]
         %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3)           ; <i32> [#uses=1]
         %tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 4)           ; <i32> [#uses=1]
@@ -26,10 +31,12 @@ define i32 @t2() {
 }
 
 define i32 @t3() {
-; CHECK-LABEL: t3:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t3:
+; ALL: push {r7, lr}
 ; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
         %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1)            ; <i32> [#uses=1]
         %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2)           ; <i32> [#uses=1]
         %tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3)           ; <i32> [#uses=1]
@@ -37,6 +44,34 @@ define i32 @t3() {
         ret i32 %tmp6
 }
 
+@g = common global i32* null
+
+define void @t4(i32 %a0, i32 %a1, i32 %a2) {
+; ALL-LABEL: t4:
+; ALL: stm.w sp, {r0, r1, r2}
+; ALL: blx _ext
+; ALL: ldm.w sp, {r0, r1, r2}
+; ALL: blx _f2
+  %arr = alloca [4 x i32], align 4
+  %p0 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 0
+  %p1 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 1
+  %p2 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 2
+  store i32* %p0, i32** @g, align 8
+
+  store i32 %a0, i32* %p0, align 4
+  store i32 %a1, i32* %p1, align 4
+  store i32 %a2, i32* %p2, align 4
+  call void @ext()
+
+  %v0 = load i32, i32* %p0, align 4
+  %v1 = load i32, i32* %p1, align 4
+  %v2 = load i32, i32* %p2, align 4
+  call i32 @f2(i32 %v0, i32 %v1, i32 %v2)
+  ret void
+}
+
 declare i32 @f1(i32, i32)
 
 declare i32 @f2(i32, i32, i32)
+
+declare void @ext()
author	Matthias Braun <matze@braunis.de>	2016-03-02 19:20:00 +0000
committer	Matthias Braun <matze@braunis.de>	2016-03-02 19:20:00 +0000
commit	f290912d2248687e688779d1d89999df56c14a09 (patch)
tree	3fcf0e6854ae16ba357b814c0a0a16f1a88f313c
parent	578787ad3055d0a874bb76b148a9d5dc7bd18db5 (diff)
download	bcm5719-llvm-f290912d2248687e688779d1d89999df56c14a09.tar.gz bcm5719-llvm-f290912d2248687e688779d1d89999df56c14a09.zip