diff options
| -rw-r--r-- | llvm/include/llvm/CodeGen/TargetLowering.h | 6 | ||||
| -rw-r--r-- | llvm/lib/CodeGen/MachineBlockPlacement.cpp | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARM.td | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 7 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.h | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMSubtarget.h | 7 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/loop-align-cortex-m.ll | 49 | 
7 files changed, 79 insertions, 1 deletion
| diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index e49cd3f188f..17603023ee3 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1435,6 +1435,12 @@ public:      return PrefLoopAlignment;    } +  /// Should loops be aligned even when the function is marked OptSize (but not +  /// MinSize). +  virtual bool alignLoopsWithOptSize() const { +    return false; +  } +    /// If the target has a standard location for the stack protector guard,    /// returns the address of that location. Otherwise, returns nullptr.    /// DEPRECATED: please override useLoadStackGuardNode and customize diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 21350df624e..624d3365b4f 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -2497,7 +2497,8 @@ void MachineBlockPlacement::alignBlocks() {    // exclusively on the loop info here so that we can align backedges in    // unnatural CFGs and backedges that were introduced purely because of the    // loop rotations done during this layout pass. 
-  if (F->getFunction().optForSize()) +  if (F->getFunction().optForMinSize() || +      (F->getFunction().optForSize() && !TLI->alignLoopsWithOptSize()))      return;    BlockChain &FunctionChain = *BlockToChain[&F->front()];    if (FunctionChain.begin() == FunctionChain.end()) diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 276ea789328..2d0f27031af 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -265,6 +265,9 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",  def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",                                               "Prefer 32-bit Thumb instrs">; +def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2", +                                              "Prefer 32-bit alignment for loops">; +  /// Some instructions update CPSR partially, which can add false dependency for  /// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is  /// mapped to a separate physical register. 
Avoid partial CPSR update for these @@ -936,6 +939,7 @@ def : ProcessorModel<"cortex-r8",   CortexA8Model,      [ARMv7r,  def : ProcessorModel<"cortex-m3", CortexM3Model,        [ARMv7m,                                                           ProcM3, +                                                         FeaturePrefLoopAlign32,                                                           FeatureHasNoBranchPredictor]>;  def : ProcessorModel<"sc300",     CortexM3Model,        [ARMv7m, @@ -946,6 +950,7 @@ def : ProcessorModel<"cortex-m4", CortexM3Model,        [ARMv7em,                                                           FeatureVFP4,                                                           FeatureVFPOnlySP,                                                           FeatureD16, +                                                         FeaturePrefLoopAlign32,                                                           FeatureHasNoBranchPredictor]>;  def : ProcNoItin<"cortex-m7",                           [ARMv7em, @@ -960,6 +965,7 @@ def : ProcessorModel<"cortex-m33", CortexM3Model,       [ARMv8mMainline,                                                           FeatureFPARMv8,                                                           FeatureD16,                                                           FeatureVFPOnlySP, +                                                         FeaturePrefLoopAlign32,                                                           FeatureHasNoBranchPredictor]>;  def : ProcNoItin<"cortex-a32",                           [ARMv8a, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 4fed2025942..8b7b6b33193 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1199,6 +1199,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,    // Prefer likely predicted branches to selects on out-of-order cores.    
PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); +  setPrefLoopAlignment(Subtarget->getPrefLoopAlignment()); +    setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);  } @@ -14695,6 +14697,11 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,                Addr});  } + +bool ARMTargetLowering::alignLoopsWithOptSize() const { +  return Subtarget->isMClass(); +} +  /// A helper function for determining the number of interleaved accesses we  /// will generate when lowering accesses of the given type.  unsigned diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 734b1ee5aa1..bf652bc7721 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -575,6 +575,8 @@ class VectorType;      bool isLegalInterleavedAccessType(VectorType *VecTy,                                        const DataLayout &DL) const; +    bool alignLoopsWithOptSize() const override; +      /// Returns the number of interleaved accesses that will be generated when      /// lowering accesses of the given type.      unsigned getNumInterleavedAccesses(VectorType *VecTy, diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index d7bfa896193..34938c399fe 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -438,6 +438,9 @@ protected:    /// operand cycle returned by the itinerary data for pre-ISel operands.    int PreISelOperandLatencyAdjustment = 2; +  /// What alignment is preferred for loop bodies, in log2(bytes). 
+  unsigned PrefLoopAlignment = 0; +    /// IsLittle - The target is Little Endian    bool IsLittle; @@ -804,6 +807,10 @@ public:    bool allowPositionIndependentMovt() const {      return isROPI() || !isTargetELF();    } + +  unsigned getPrefLoopAlignment() const { +    return PrefLoopAlignment; +  }  };  } // end namespace llvm diff --git a/llvm/test/CodeGen/ARM/loop-align-cortex-m.ll b/llvm/test/CodeGen/ARM/loop-align-cortex-m.ll new file mode 100644 index 00000000000..1b41c1b6c3f --- /dev/null +++ b/llvm/test/CodeGen/ARM/loop-align-cortex-m.ll @@ -0,0 +1,49 @@ +; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s + +define void @test_loop_alignment(i32* %in, i32*  %out) optsize { +; CHECK-LABEL: test_loop_alignment: +; CHECK: movs {{r[0-9]+}}, #0 +; CHECK: .p2align 2 + +entry: +  br label %loop + +loop: +  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] +  %in.addr = getelementptr inbounds i32, i32* %in, i32 %i +  %lhs = load i32, i32* %in.addr, align 4 +  %res = mul nsw i32 %lhs, 5 +  %out.addr = getelementptr inbounds i32, i32* %out, i32 %i +  store i32 %res, i32* %out.addr, align 4 +  %i.next = add i32 %i, 1 +  %done = icmp eq i32 %i.next, 1024 +  br i1 %done, label %end, label %loop + +end: +  ret void +} + +define void @test_loop_alignment_minsize(i32* %in, i32*  %out) minsize { +; CHECK-LABEL: test_loop_alignment_minsize: +; CHECK: movs {{r[0-9]+}}, #0 +; CHECK-NOT: .p2align + +entry: +  br label %loop + +loop: +  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] +  %in.addr = getelementptr inbounds i32, i32* %in, i32 %i +  %lhs = load i32, i32* %in.addr, align 4 +  %res = mul nsw i32 %lhs, 5 +  %out.addr = getelementptr inbounds i32, i32* %out, i32 %i +  store i32 %res, i32* %out.addr, align 4 +  %i.next = add i32 %i, 1 +  %done = icmp eq i32 %i.next, 1024 +  br i1 %done, label 
%end, label %loop + +end: +  ret void +} | 

