summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp108
-rw-r--r--llvm/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll2
-rw-r--r--llvm/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll4
-rw-r--r--llvm/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll60
4 files changed, 171 insertions, 3 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 73436f13c94..b4a3591079e 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -140,6 +140,13 @@ static cl::opt<bool> LSRExpNarrow(
cl::desc("Narrow LSR complex solution using"
" expectation of registers number"));
+// Flag to narrow search space by filtering non-optimal formulae with
+// the same ScaledReg and Scale.
+static cl::opt<bool> FilterSameScaledReg(
+ "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
+ cl::desc("Narrow LSR search space by filtering non-optimal formulae"
+ " with the same ScaledReg and Scale"));
+
#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
@@ -1902,6 +1909,7 @@ class LSRInstance {
void NarrowSearchSpaceByDetectingSupersets();
void NarrowSearchSpaceByCollapsingUnrolledCode();
void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
void NarrowSearchSpaceByDeletingCostlyFormulas();
void NarrowSearchSpaceByPickingWinnerRegs();
void NarrowSearchSpaceUsingHeuristics();
@@ -4306,6 +4314,104 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
}
}
+/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
+/// Pick the best one and delete the others.
+/// This narrowing heuristic is to keep as many formulae with different
+/// Scale and ScaledReg pair as possible while narrowing the search space.
+/// The benefit is that it is more likely to find out a better solution
+/// from a formulae set with more Scale and ScaledReg variations than
+/// a formulae set with the same Scale and ScaledReg. The picking winner
+/// reg heurstic will often keep the formulae with the same Scale and
+/// ScaledReg and filter others, and we want to avoid that if possible.
+void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
+
+ // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
+ typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy;
+ BestFormulaeTy BestFormulae;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+ // Return true if Formula FA is better than Formula FB.
+ auto IsBetterThan = [&](Formula &FA, Formula &FB) {
+ // First we will try to choose the Formula with fewer new registers.
+ // For a register used by current Formula, the more the register is
+ // shared among LSRUses, the less we increase the register number
+ // counter of the formula.
+ size_t FARegNum = 0;
+ for (const SCEV *Reg : FA.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FARegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ size_t FBRegNum = 0;
+ for (const SCEV *Reg : FB.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FBRegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ if (FARegNum != FBRegNum)
+ return FARegNum < FBRegNum;
+
+ // If the new register numbers are the same, choose the Formula with
+ // less Cost.
+ Cost CostFA, CostFB;
+ Regs.clear();
+ CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+ Regs.clear();
+ CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
+ return CostFA.isLess(CostFB, TTI);
+ };
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (!F.ScaledReg)
+ continue;
+ auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+ if (IsBetterThan(F, Best))
+ std::swap(F, Best);
+ DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
+
/// The function delete formulas with high registers number expectation.
/// Assuming we don't know the value of each formula (already delete
/// all inefficient), generate probability of not selecting for each
@@ -4516,6 +4622,8 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
NarrowSearchSpaceByDetectingSupersets();
NarrowSearchSpaceByCollapsingUnrolledCode();
NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ if (FilterSameScaledReg)
+ NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
if (LSRExpNarrow)
NarrowSearchSpaceByDeletingCostlyFormulas();
else
diff --git a/llvm/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/llvm/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
index ba8ff1bc181..3bb14c4b1cd 100644
--- a/llvm/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
+++ b/llvm/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
+; RUN: llc -lsr-filter-same-scaled-reg=false < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
; Test case for the recoloring of broken hints.
; This is tricky to have something reasonably small to kick this optimization since
; it requires that spliting and spilling occur.
diff --git a/llvm/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll b/llvm/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
index dcd068191e1..ea3f6077231 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
@@ -14,8 +14,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; current LSR cost model.
; CHECK-NOT: = ptrtoint i8* undef to i64
; CHECK: .lr.ph
-; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1
-; CHECK: sub i64 [[TMP]], %tmp6
+; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp{{[0-9]+}}, -1
+; CHECK: sub i64 [[TMP]], %tmp{{[0-9]+}}
; CHECK: ret void
define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 {
bb:
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
new file mode 100644
index 00000000000..4ce6f1a79fb
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -loop-reduce -lsr-filter-same-scaled-reg=true -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ham = type { i8, i8, [5 x i32], i64, i64, i64 }
+
+@global = external local_unnamed_addr global %struct.ham, align 8
+
+define void @foo() local_unnamed_addr {
+bb:
+ %tmp = load i64, i64* getelementptr inbounds (%struct.ham, %struct.ham* @global, i64 0, i32 3), align 8
+ %tmp1 = and i64 %tmp, 1792
+ %tmp2 = load i64, i64* getelementptr inbounds (%struct.ham, %struct.ham* @global, i64 0, i32 4), align 8
+ %tmp3 = add i64 %tmp1, %tmp2
+ %tmp4 = load i8*, i8** null, align 8
+ %tmp5 = getelementptr inbounds i8, i8* %tmp4, i64 0
+ %tmp6 = sub i64 0, %tmp3
+ %tmp7 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp6
+ %tmp8 = inttoptr i64 0 to i8*
+ br label %bb9
+
+; Without filtering non-optimal formulae with the same ScaledReg and Scale, the strategy
+; to narrow LSR search space by picking winner reg will generate only one lsr.iv and
+; unoptimal result.
+; CHECK-LABEL: @foo(
+; CHECK: bb9:
+; CHECK-NEXT: = phi i8*
+; CHECK-NEXT: = phi i8*
+
+bb9: ; preds = %bb12, %bb
+ %tmp10 = phi i8* [ %tmp7, %bb ], [ %tmp16, %bb12 ]
+ %tmp11 = phi i8* [ %tmp8, %bb ], [ %tmp17, %bb12 ]
+ br i1 false, label %bb18, label %bb12
+
+bb12: ; preds = %bb9
+ %tmp13 = getelementptr inbounds i8, i8* %tmp10, i64 8
+ %tmp14 = bitcast i8* %tmp13 to i64*
+ %tmp15 = load i64, i64* %tmp14, align 1
+ %tmp16 = getelementptr inbounds i8, i8* %tmp10, i64 16
+ %tmp17 = getelementptr inbounds i8, i8* %tmp11, i64 16
+ br label %bb9
+
+bb18: ; preds = %bb9
+ %tmp19 = icmp ugt i8* %tmp11, null
+ %tmp20 = getelementptr inbounds i8, i8* %tmp10, i64 8
+ %tmp21 = getelementptr inbounds i8, i8* %tmp11, i64 8
+ %tmp22 = select i1 %tmp19, i8* %tmp10, i8* %tmp20
+ %tmp23 = select i1 %tmp19, i8* %tmp11, i8* %tmp21
+ br label %bb24
+
+bb24: ; preds = %bb24, %bb18
+ %tmp25 = phi i8* [ %tmp27, %bb24 ], [ %tmp22, %bb18 ]
+ %tmp26 = phi i8* [ %tmp29, %bb24 ], [ %tmp23, %bb18 ]
+ %tmp27 = getelementptr inbounds i8, i8* %tmp25, i64 1
+ %tmp28 = load i8, i8* %tmp25, align 1
+ %tmp29 = getelementptr inbounds i8, i8* %tmp26, i64 1
+ store i8 %tmp28, i8* %tmp26, align 1
+ %tmp30 = icmp eq i8* %tmp29, %tmp5
+ br label %bb24
+}
OpenPOWER on IntegriCloud