summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJohn Brawn <john.brawn@arm.com>2017-06-28 14:11:15 +0000
committerJohn Brawn <john.brawn@arm.com>2017-06-28 14:11:15 +0000
commit75d76e5e956c75fc524253d57153e40836d3e6d5 (patch)
tree67844a7a4f3db463b3d3abbbb476d2e64c04c14a
parent48b30c3d55b450f841b222394c840da12f0329a2 (diff)
downloadbcm5719-llvm-75d76e5e956c75fc524253d57153e40836d3e6d5.tar.gz
bcm5719-llvm-75d76e5e956c75fc524253d57153e40836d3e6d5.zip
[ARM] Improve if-conversion for M-class CPUs without branch predictors
The current heuristic in isProfitableToIfCvt assumes we have a branch predictor, and so gives the wrong answer in some cases when we don't. This patch adds a subtarget feature to indicate that a subtarget has no branch predictor, and changes the heuristic in isProfitableToIfCvt when it's present. This gives a slight overall improvement in a set of embedded benchmarks on Cortex-M4 and Cortex-M33. Differential Revision: https://reviews.llvm.org/D34398 llvm-svn: 306547
-rw-r--r--llvm/lib/Target/ARM/ARM.td26
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp45
-rw-r--r--llvm/lib/Target/ARM/ARMSchedule.td1
-rw-r--r--llvm/lib/Target/ARM/ARMScheduleM3.td21
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.h6
-rw-r--r--llvm/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll154
6 files changed, 239 insertions, 14 deletions
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 840600914b1..c52a1d7611d 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
"Has return address stack">;
+// Some processors have no branch predictor, which changes the expected cost of
+// taking a branch which affects the choice of whether to use predicated
+// instructions.
+def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor",
+ "HasBranchPredictor", "false",
+ "Has no branch predictor">;
+
/// DSP extension.
def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
"Supports DSP instructions in ARM and/or Thumb2">;
@@ -756,13 +763,19 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
FeatureHasSlowFPVMLx,
FeatureAvoidPartialCPSR]>;
-def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>;
-def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>;
+def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m,
+ ProcM3,
+ FeatureHasNoBranchPredictor]>;
+
+def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
+ ProcM3,
+ FeatureHasNoBranchPredictor]>;
-def : ProcNoItin<"cortex-m4", [ARMv7em,
+def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
FeatureVFP4,
FeatureVFPOnlySP,
- FeatureD16]>;
+ FeatureD16,
+ FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,
@@ -771,11 +784,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em,
def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
FeatureNoMovt]>;
-def : ProcNoItin<"cortex-m33", [ARMv8mMainline,
+def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
FeatureDSP,
FeatureFPARMv8,
FeatureD16,
- FeatureVFPOnlySP]>;
+ FeatureVFPOnlySP,
+ FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index e0810c358f2..1ec6b24b2ed 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
}
bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &,
+isProfitableToIfCvt(MachineBasicBlock &TBB,
unsigned TCycles, unsigned TExtra,
- MachineBasicBlock &,
+ MachineBasicBlock &FBB,
unsigned FCycles, unsigned FExtra,
BranchProbability Probability) const {
if (!TCycles)
@@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &,
// Here we scale up each component of UnpredCost to avoid precision issue when
// scaling TCycles/FCycles by Probability.
const unsigned ScalingUpFactor = 1024;
- unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
- unsigned FUnpredCost =
+
+ unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor;
+ unsigned UnpredCost;
+ if (!Subtarget.hasBranchPredictor()) {
+ // When we don't have a branch predictor it's always cheaper to not take a
+ // branch than take it, so we have to take that into account.
+ unsigned NotTakenBranchCost = 1;
+ unsigned TakenBranchCost = Subtarget.getMispredictionPenalty();
+ unsigned TUnpredCycles, FUnpredCycles;
+ if (!FCycles) {
+ // Triangle: TBB is the fallthrough
+ TUnpredCycles = TCycles + NotTakenBranchCost;
+ FUnpredCycles = TakenBranchCost;
+ } else {
+ // Diamond: TBB is the block that is branched to, FBB is the fallthrough
+ TUnpredCycles = TCycles + TakenBranchCost;
+ FUnpredCycles = FCycles + NotTakenBranchCost;
+ }
+ // The total cost is the cost of each path scaled by their probabilities
+ unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor);
+ unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor);
+ UnpredCost = TUnpredCost + FUnpredCost;
+ // When predicating assume that the first IT can be folded away but later
+ // ones cost one cycle each
+ if (Subtarget.isThumb2() && TCycles + FCycles > 4) {
+ PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor;
+ }
+ } else {
+ unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
+ unsigned FUnpredCost =
Probability.getCompl().scale(FCycles * ScalingUpFactor);
- unsigned UnpredCost = TUnpredCost + FUnpredCost;
- UnpredCost += 1 * ScalingUpFactor; // The branch itself
- UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+ UnpredCost = TUnpredCost + FUnpredCost;
+ UnpredCost += 1 * ScalingUpFactor; // The branch itself
+ UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+ }
- return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
+ return PredCost <= UnpredCost;
}
bool
diff --git a/llvm/lib/Target/ARM/ARMSchedule.td b/llvm/lib/Target/ARM/ARMSchedule.td
index 1c7902520f2..53e012f13ee 100644
--- a/llvm/lib/Target/ARM/ARMSchedule.td
+++ b/llvm/lib/Target/ARM/ARMSchedule.td
@@ -424,3 +424,4 @@ include "ARMScheduleA9.td"
include "ARMScheduleSwift.td"
include "ARMScheduleR52.td"
include "ARMScheduleA57.td"
+include "ARMScheduleM3.td"
diff --git a/llvm/lib/Target/ARM/ARMScheduleM3.td b/llvm/lib/Target/ARM/ARMScheduleM3.td
new file mode 100644
index 00000000000..93f8299f9bd
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMScheduleM3.td
@@ -0,0 +1,21 @@
+//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-M3 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM3Model : SchedMachineModel {
+ let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue
+ let MicroOpBufferSize = 0; // In-order
+ let LoadLatency = 2; // Latency when not pipelined, not pc-relative
+ let MispredictPenalty = 2; // Best case branch taken cost
+
+ let CompleteModel = 0;
+}
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index d890d0fa777..e15b17512c9 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -246,6 +246,11 @@ protected:
/// avoid issue "normal" call instructions to callees which do not return.
bool HasRetAddrStack = false;
+ /// HasBranchPredictor - True if the subtarget has a branch predictor. Having
+ /// a branch predictor or not changes the expected cost of taking a branch
+ /// which affects the choice of whether to use predicated instructions.
+ bool HasBranchPredictor = true;
+
/// HasMPExtension - True if the subtarget supports Multiprocessing
/// extension (ARMv7 only).
bool HasMPExtension = false;
@@ -554,6 +559,7 @@ public:
bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
bool hasRetAddrStack() const { return HasRetAddrStack; }
+ bool hasBranchPredictor() const { return HasBranchPredictor; }
bool hasMPExtension() const { return HasMPExtension; }
bool hasDSP() const { return HasDSP; }
bool useNaClTrap() const { return UseNaClTrap; }
diff --git a/llvm/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll b/llvm/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
new file mode 100644
index 00000000000..9fcc0f5d617
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
@@ -0,0 +1,154 @@
+; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BP
+; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m3 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOBP
+
+declare void @otherfn()
+
+; CHECK-LABEL: triangle1:
+; CHECK: itt ne
+; CHECK: movne
+; CHECK: strne
+define i32 @triangle1(i32 %n, i32* %p) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ store i32 1, i32* %p, align 4
+ br label %if.end
+
+if.end:
+ tail call void @otherfn()
+ ret i32 0
+}
+
+; CHECK-LABEL: triangle2:
+; CHECK-BP: itttt ne
+; CHECK-BP: movne
+; CHECK-BP: strne
+; CHECK-BP: movne
+; CHECK-BP: strne
+; CHECK-NOBP: cbz
+; CHECK-NOBP: movs
+; CHECK-NOBP: str
+; CHECK-NOBP: movs
+; CHECK-NOBP: str
+define i32 @triangle2(i32 %n, i32* %p, i32* %q) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ store i32 1, i32* %p, align 4
+ store i32 2, i32* %q, align 4
+ br label %if.end
+
+if.end:
+ tail call void @otherfn()
+ ret i32 0
+}
+
+; CHECK-LABEL: triangle3:
+; CHECK: cbz
+; CHECK: movs
+; CHECK: str
+; CHECK: movs
+; CHECK: str
+; CHECK: movs
+; CHECK: str
+define i32 @triangle3(i32 %n, i32* %p, i32* %q, i32* %r) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ store i32 1, i32* %p, align 4
+ store i32 2, i32* %q, align 4
+ store i32 3, i32* %r, align 4
+ br label %if.end
+
+if.end:
+ tail call void @otherfn()
+ ret i32 0
+}
+
+; CHECK-LABEL: diamond1:
+; CHECK: ite eq
+; CHECK: ldreq
+; CHECK: strne
+define i32 @diamond1(i32 %n, i32* %p) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+ store i32 %n, i32* %p, align 4
+ br label %if.end
+
+if.else:
+ %0 = load i32, i32* %p, align 4
+ br label %if.end
+
+if.end:
+ %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
+ tail call void @otherfn()
+ ret i32 %n.addr.0
+}
+
+; CHECK-LABEL: diamond2:
+; CHECK-BP: itte
+; CHECK-BP: streq
+; CHECK-BP: ldreq
+; CHECK-BP: strne
+; CHECK-NOBP: cbz
+; CHECK-NOBP: str
+; CHECK-NOBP: b
+; CHECK-NOBP: str
+; CHECK-NOBP: ldr
+define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+ store i32 %n, i32* %p, align 4
+ br label %if.end
+
+if.else:
+ store i32 %m, i32* %q, align 4
+ %0 = load i32, i32* %p, align 4
+ br label %if.end
+
+if.end:
+ %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
+ tail call void @otherfn()
+ ret i32 %n.addr.0
+}
+
+; CHECK-LABEL: diamond3:
+; CHECK: cbz
+; CHECK: movs
+; CHECK: str
+; CHECK: b
+; CHECK: ldr
+; CHECK: ldr
+; CHECK: adds
+define i32 @diamond3(i32 %n, i32* %p, i32* %q) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+ store i32 1, i32* %p, align 4
+ br label %if.end
+
+if.else:
+ %0 = load i32, i32* %p, align 4
+ %1 = load i32, i32* %q, align 4
+ %add = add nsw i32 %1, %0
+ br label %if.end
+
+if.end:
+ %n.addr.0 = phi i32 [ %n, %if.then ], [ %add, %if.else ]
+ tail call void @otherfn()
+ ret i32 %n.addr.0
+}
OpenPOWER on IntegriCloud