summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
authorEli Friedman <efriedma@codeaurora.org>2017-03-01 22:56:20 +0000
committerEli Friedman <efriedma@codeaurora.org>2017-03-01 22:56:20 +0000
commit28c2c0e311b3055062213388f967fc75c72976dc (patch)
tree893538ce433a140ed8029196cba86471fa015d92 /llvm
parentac2d815a2aec2aca213bb9b919237ee407d0df51 (diff)
downloadbcm5719-llvm-28c2c0e311b3055062213388f967fc75c72976dc.tar.gz
bcm5719-llvm-28c2c0e311b3055062213388f967fc75c72976dc.zip
[ARM] Check correct instructions for load/store rescheduling.
This code starts from the high end of the sorted vector of offsets, and works backwards: it tries to find contiguous offsets, process them, then pops them from the end of the vector. Most of the code agrees with this order of processing, but one loop doesn't: it instead processes elements from the low end of the vector (which are nodes with unrelated offsets). Fix that loop to process the correct elements. This has a few implications. One, we don't incorrectly return early when processing multiple groups of offsets in the same block (which allows rescheduling prera-ldst-insertpt.mir). Two, we pick the correct insert point for loads, so they're correctly sorted (which affects the scheduling of vldm-liveness.ll). I think it might also impact some of the heuristics slightly. Differential Revision: https://reviews.llvm.org/D30368 llvm-svn: 296701
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp2
-rw-r--r--llvm/test/CodeGen/ARM/prera-ldst-insertpt.mir99
-rw-r--r--llvm/test/CodeGen/ARM/vldm-liveness.ll19
-rw-r--r--llvm/test/CodeGen/ARM/vldm-liveness.mir40
4 files changed, 143 insertions, 17 deletions
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 8e51f136b32..f9eec7d4006 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -2195,7 +2195,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
else {
SmallPtrSet<MachineInstr*, 4> MemOps;
SmallSet<unsigned, 4> MemRegs;
- for (int i = NumMove-1; i >= 0; --i) {
+ for (size_t i = Ops.size() - NumMove, e = Ops.size(); i != e; ++i) {
MemOps.insert(Ops[i]);
MemRegs.insert(Ops[i]->getOperand(0).getReg());
}
diff --git a/llvm/test/CodeGen/ARM/prera-ldst-insertpt.mir b/llvm/test/CodeGen/ARM/prera-ldst-insertpt.mir
new file mode 100644
index 00000000000..1db38c8b004
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/prera-ldst-insertpt.mir
@@ -0,0 +1,99 @@
+# RUN: llc -run-pass arm-prera-ldst-opt %s -o - | FileCheck %s
+--- |
+ target triple = "thumbv7---eabi"
+
+ define void @a(i32* nocapture %x, i32 %y, i32 %z) {
+ entry:
+ ret void
+ }
+
+ define void @b(i32* nocapture %x, i32 %y, i32 %z) {
+ entry:
+ ret void
+ }
+...
+---
+# CHECK-LABEL: name: a
+name: a
+alignment: 1
+tracksRegLiveness: true
+liveins:
+ - { reg: '%r0', virtual-reg: '%0' }
+ - { reg: '%r1', virtual-reg: '%1' }
+ - { reg: '%r2', virtual-reg: '%2' }
+body: |
+ bb.0.entry:
+ liveins: %r0, %r1, %r2
+
+ %2 : rgpr = COPY %r2
+ %1 : rgpr = COPY %r1
+ %0 : gpr = COPY %r0
+ %3 : rgpr = t2MUL %2, %2, 14, _
+ %4 : rgpr = t2MUL %1, %1, 14, _
+ %5 : rgpr = t2MOVi32imm -858993459
+ %6 : rgpr, %7 : rgpr = t2UMULL killed %3, %5, 14, _
+ %8 : rgpr, %9 : rgpr = t2UMULL killed %4, %5, 14, _
+ t2STRi12 %1, %0, 0, 14, _ :: (store 4)
+ %10 : rgpr = t2LSLri %2, 1, 14, _, _
+ t2STRi12 killed %10, %0, 4, 14, _ :: (store 4)
+ %11 : rgpr = t2MOVi 55, 14, _, _
+ %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _
+ t2STRi12 killed %12, %0, 16, 14, _ :: (store 4)
+ %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _
+ t2STRi12 killed %13, %0, 20, 14, _ :: (store 4)
+
+ ; Make sure we move the paired stores next to each other.
+ ; FIXME: Make sure we don't extend the live-range of a store
+ ; when we don't need to.
+ ; CHECK: t2STRi12 %1,
+ ; CHECK-NEXT: t2STRi12 killed %10,
+ ; CHECK-NEXT: %13 = t2ADDrs %11
+ ; CHECK-NEXT: t2STRi12 killed %12,
+ ; CHECK-NEXT: t2STRi12 killed %13,
+
+ tBX_RET 14, _
+---
+# CHECK-LABEL: name: b
+name: b
+alignment: 1
+tracksRegLiveness: true
+liveins:
+ - { reg: '%r0', virtual-reg: '%0' }
+ - { reg: '%r1', virtual-reg: '%1' }
+ - { reg: '%r2', virtual-reg: '%2' }
+body: |
+ bb.0.entry:
+ liveins: %r0, %r1, %r2
+
+ %2 : rgpr = COPY %r2
+ %1 : rgpr = COPY %r1
+ %0 : gpr = COPY %r0
+ t2STRi12 %1, %0, 0, 14, _ :: (store 4)
+ %10 : rgpr = t2LSLri %2, 1, 14, _, _
+ t2STRi12 killed %10, %0, 4, 14, _ :: (store 4)
+ %3 : rgpr = t2MUL %2, %2, 14, _
+ t2STRi12 %3, %0, 8, 14, _ :: (store 4)
+ %4 : rgpr = t2MUL %1, %1, 14, _
+ %5 : rgpr = t2MOVi32imm -858993459
+ %6 : rgpr, %7 : rgpr = t2UMULL killed %3, %5, 14, _
+ %8 : rgpr, %9 : rgpr = t2UMULL killed %4, %5, 14, _
+ %10 : rgpr = t2LSLri %2, 1, 14, _, _
+ %11 : rgpr = t2MOVi 55, 14, _, _
+ %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _
+ t2STRi12 killed %12, %0, 16, 14, _ :: (store 4)
+ %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _
+ t2STRi12 killed %13, %0, 20, 14, _ :: (store 4)
+
+ ; Make sure we move the paired stores next to each other.
+ ; FIXME: Make sure we don't extend the live-range of a store
+ ; when we don't need to.
+ ; CHECK: t2STRi12 {{.*}}, 0
+ ; CHECK-NEXT: t2STRi12 {{.*}}, 4
+ ; CHECK-NEXT: t2STRi12 {{.*}}, 8
+ ; CHECK-NEXT: t2ADDrs
+ ; CHECK-NEXT: t2STRi12 {{.*}}, 16
+ ; CHECK-NEXT: t2STRi12 {{.*}}, 20
+
+ tBX_RET 14, _
+
+...
diff --git a/llvm/test/CodeGen/ARM/vldm-liveness.ll b/llvm/test/CodeGen/ARM/vldm-liveness.ll
index e114e6970a3..63dc9d61ebc 100644
--- a/llvm/test/CodeGen/ARM/vldm-liveness.ll
+++ b/llvm/test/CodeGen/ARM/vldm-liveness.ll
@@ -1,26 +1,13 @@
; RUN: llc -mtriple thumbv7-apple-ios -verify-machineinstrs -o - %s | FileCheck %s
-; ARM load store optimizer was dealing with a sequence like:
-; s1 = VLDRS [r0, 1], Q0<imp-def>
-; s3 = VLDRS [r0, 2], Q0<imp-use,kill>, Q0<imp-def>
-; s0 = VLDRS [r0, 0], Q0<imp-use,kill>, Q0<imp-def>
-; s2 = VLDRS [r0, 4], Q0<imp-use,kill>, Q0<imp-def>
+; Make sure we emit the loads in ascending order, and form a vldmia.
;
-; It decided to combine the {s0, s1} loads into a single instruction in the
-; third position. However, this leaves the instruction defining s3 with a stray
-; imp-use of Q0, which is undefined.
-;
-; The verifier catches this, so this test just makes sure that appropriate
-; liveness flags are added.
-;
-; I believe the change will be tested as long as the vldmia is not the first of
-; the loads. Earlier optimisations may perturb the output over time, but
-; fiddling the indices should be sufficient to restore the test.
+; See vldm-liveness.mir for the bug this file was originally testing.
define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) {
; CHECK-LABEL: foo:
-; CHECK: vldr s3, [r0, #8]
; CHECK: vldmia r0, {s0, s1}
+; CHECK: vldr s3, [r0, #8]
; CHECK: vldr s2, [r0, #16]
%off0 = getelementptr float, float* %ptr, i32 0
%val0 = load float, float* %off0
diff --git a/llvm/test/CodeGen/ARM/vldm-liveness.mir b/llvm/test/CodeGen/ARM/vldm-liveness.mir
new file mode 100644
index 00000000000..a85a018a8b1
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/vldm-liveness.mir
@@ -0,0 +1,40 @@
+# RUN: llc -run-pass arm-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s
+# ARM load store optimizer was dealing with a sequence like:
+# s1 = VLDRS [r0, 1], Q0<imp-def>
+# s3 = VLDRS [r0, 2], Q0<imp-use,kill>, Q0<imp-def>
+# s0 = VLDRS [r0, 0], Q0<imp-use,kill>, Q0<imp-def>
+# s2 = VLDRS [r0, 4], Q0<imp-use,kill>, Q0<imp-def>
+#
+# It decided to combine the {s0, s1} loads into a single instruction in the
+# third position. However, this leaves the instruction defining s3 with a stray
+# imp-use of Q0, which is undefined.
+#
+# The verifier catches this, so this test just makes sure that appropriate
+# liveness flags are added.
+--- |
+ target triple = "thumbv7-apple-ios"
+ define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) {
+ ret <4 x float> undef
+ }
+...
+---
+name: foo
+alignment: 1
+liveins:
+ - { reg: '%r0' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %r0
+
+ %s1 = VLDRS %r0, 1, 14, _, implicit-def %q0 :: (load 4)
+ %s3 = VLDRS %r0, 2, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+ ; CHECK: %s3 = VLDRS %r0, 2, 14, _, implicit killed undef %q0, implicit-def %q0 :: (load 4)
+
+ %s0 = VLDRS %r0, 0, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+ ; CHECK: VLDMSIA %r0, 14, _, def %s0, def %s1, implicit-def _
+
+ %s2 = VLDRS killed %r0, 4, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+ ; CHECK: %s2 = VLDRS killed %r0, 4, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+
+ tBX_RET 14, _, implicit %q0
+...
OpenPOWER on IntegriCloud