From c15d47bb013e975da582c8fd786ba8234d70d75d Mon Sep 17 00:00:00 2001
From: Tim Northover <tnorthover@apple.com>
Date: Thu, 13 Sep 2018 10:28:05 +0000
Subject: ARM: align loops to 4 bytes on Cortex-M3 and Cortex-M4.

The Technical Reference Manuals for these two CPUs state that branching
to an unaligned 32-bit instruction incurs an extra pipeline reload
penalty. That's bad.

This also enables the optimization at -Os since it costs on average one
byte per loop in return for 1 cycle per iteration, which is pretty good
going.

llvm-svn: 342127
---
 llvm/lib/CodeGen/MachineBlockPlacement.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'llvm/lib/CodeGen')

diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 21350df624e..624d3365b4f 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -2497,7 +2497,8 @@ void MachineBlockPlacement::alignBlocks() {
   // exclusively on the loop info here so that we can align backedges in
   // unnatural CFGs and backedges that were introduced purely because of the
   // loop rotations done during this layout pass.
-  if (F->getFunction().optForSize())
+  if (F->getFunction().optForMinSize() ||
+      (F->getFunction().optForSize() && !TLI->alignLoopsWithOptSize()))
     return;
   BlockChain &FunctionChain = *BlockToChain[&F->front()];
   if (FunctionChain.begin() == FunctionChain.end())
-- 
cgit v1.2.3