-rw-r--r-- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 103
-rw-r--r-- llvm/test/Transforms/LoopUnroll/X86/lit.local.cfg | 4
-rw-r--r-- llvm/test/Transforms/LoopUnroll/X86/partial.ll | 80
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll | 20
4 files changed, 197 insertions, 10 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ed04cdc4e40..437f63d3280 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -17,8 +17,11 @@
#define DEBUG_TYPE "x86tti"
#include "X86.h"
#include "X86TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
@@ -31,6 +34,17 @@ namespace llvm {
void initializeX86TTIPass(PassRegistry &);
}
+static cl::opt<bool>
+UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
+ cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
+ cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
+ cl::desc("Threshold for taken branches in X86 partial unrolling"),
+ cl::Hidden);
+
namespace {
class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -73,6 +87,8 @@ public:
/// \name Scalar TTI Implementations
/// @{
PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+ void getUnrollingPreferences(Loop *L,
+ UnrollingPreferences &UP) const override;
/// @}
@@ -137,6 +153,93 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}
+void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+ if (!UsePartialUnrolling)
+ return;
+ // According to the Intel 64 and IA-32 Architectures Optimization Reference
+ // Manual, Intel Core models and later have a loop stream detector
+ // (and associated uop queue) that can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have no more than 4 (8 for Nehalem and later) branches
+ // taken, and none of them may be calls.
+ // - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+ // According to the Software Optimization Guide for AMD Family 15h Processors,
+ // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+ // buffer which can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have fewer than 16 branches
+ // - The loop must have fewer than 40 uops in all executed loop branches
+
+ unsigned MaxBranches, MaxOps;
+ if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
+ MaxBranches = PartialUnrollingMaxBranches;
+ MaxOps = PartialUnrollingThreshold;
+ } else if (ST->isAtom()) {
+ // On the Atom, the throughput for taken branches is 2 cycles. For small
+ // simple loops, expand by a small factor to hide the backedge cost.
+ MaxBranches = 2;
+ MaxOps = 10;
+ } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
+ MaxBranches = 16;
+ MaxOps = 40;
+ } else if (ST->hasFMA4() /* Any other recent AMD */) {
+ return;
+ } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
+ MaxBranches = 8;
+ MaxOps = 28;
+ } else if (ST->hasSSSE3() /* Intel Core */) {
+ MaxBranches = 4;
+ MaxOps = 18;
+ } else {
+ return;
+ }
+
+ // Scan the loop: don't unroll loops with calls, and count the potential
+ // number of taken branches (this is somewhat conservative because we're
+ // counting all block transitions as potential branches while in reality some
+ // of these will become implicit via block placement).
+ unsigned MaxDepth = 0;
+ for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
+ DE = df_end(L->getHeader()); DI != DE;) {
+ if (!L->contains(*DI)) {
+ DI.skipChildren();
+ continue;
+ }
+
+ MaxDepth = std::max(MaxDepth, DI.getPathLength());
+ if (MaxDepth > MaxBranches)
+ return;
+
+ for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ ImmutableCallSite CS(I);
+ if (const Function *F = CS.getCalledFunction()) {
+ if (!isLoweredToCall(F))
+ continue;
+ }
+
+ return;
+ }
+
+ ++DI;
+ }
+
+ // Enable runtime and partial unrolling up to the specified size.
+ UP.Partial = UP.Runtime = true;
+ UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+
+ // Set the maximum count based on the loop depth. The maximum number of
+ // branches taken in a loop (including the backedge) is equal to the maximum
+ // loop depth (the DFS path length from the loop header to any block in the
+ // loop). When the loop is unrolled, this depth (except for the backedge
+ // itself) is multiplied by the unrolling factor. This new unrolled depth
+ // must be less than the target-specific maximum branch count (which limits
+ // the number of taken branches in the uop buffer).
+ if (MaxDepth > 1)
+ UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
+}
+
unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasSSE1())
return 0;
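
The interaction between the taken-branch budget and the DFS depth is the subtle part of the hunk above. The following is a minimal, self-contained C++ sketch of that arithmetic; the names UnrollCaps and maxUnrollCount are illustrative only and are not part of the LLVM API.

#include <cstdio>

// Illustrative stand-ins for the per-target budgets selected in the patch
// (e.g. Nehalem and later: 8 taken branches, 28 uops).
struct UnrollCaps {
  unsigned MaxBranches; // taken-branch capacity of the loop buffer
  unsigned MaxOps;      // uop capacity of the loop buffer
};

// Mirrors the tail of X86TTI::getUnrollingPreferences(): MaxDepth is the
// longest DFS path (in blocks) from the loop header through the loop body,
// i.e. the number of taken branches per iteration including the backedge.
// Unrolling multiplies every taken branch except the backedge, so the cap
// on the unroll count is (MaxBranches - 1) / (MaxDepth - 1).
unsigned maxUnrollCount(UnrollCaps Caps, unsigned MaxDepth) {
  if (MaxDepth <= 1)
    return 0; // single-block loop: the patch sets no explicit cap
  return (Caps.MaxBranches - 1) / (MaxDepth - 1);
}

int main() {
  UnrollCaps Nehalem = {8, 28};
  // A loop whose deepest path is 3 blocks can be unrolled at most
  // (8 - 1) / (3 - 1) = 3 times before overflowing the branch budget.
  std::printf("cap = %u\n", maxUnrollCount(Nehalem, 3));
  return 0;
}

Note that UP.PartialThreshold, which the patch sets to MaxOps, independently bounds the size of the unrolled body, so both buffer limits are respected.
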
diff --git a/llvm/test/Transforms/LoopUnroll/X86/lit.local.cfg b/llvm/test/Transforms/LoopUnroll/X86/lit.local.cfg
new file mode 100644
index 00000000000..ba763cf03ff
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/X86/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())
+if 'X86' not in targets:
+ config.unsupported = True
+
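
This lit.local.cfg follows the standard pattern for target-specific test directories: if the X86 backend is not among the configured targets_to_build, the directory is marked unsupported and its tests are skipped.
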
diff --git a/llvm/test/Transforms/LoopUnroll/X86/partial.ll b/llvm/test/Transforms/LoopUnroll/X86/partial.ll
new file mode 100644
index 00000000000..15867cbea0a
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/X86/partial.ll
@@ -0,0 +1,80 @@
+; RUN: opt < %s -S -loop-unroll -mcpu=nehalem -x86-use-partial-unrolling=1 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -mcpu=core -x86-use-partial-unrolling=1 | FileCheck -check-prefix=CHECK-NOUNRL %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i32* noalias nocapture readnone %ip, double %alpha, double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds double* %b, i64 %index
+ %1 = bitcast double* %0 to <2 x double>*
+ %wide.load = load <2 x double>* %1, align 8
+ %.sum9 = or i64 %index, 2
+ %2 = getelementptr double* %b, i64 %.sum9
+ %3 = bitcast double* %2 to <2 x double>*
+ %wide.load8 = load <2 x double>* %3, align 8
+ %4 = fadd <2 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00>
+ %5 = fadd <2 x double> %wide.load8, <double 1.000000e+00, double 1.000000e+00>
+ %6 = getelementptr inbounds double* %a, i64 %index
+ %7 = bitcast double* %6 to <2 x double>*
+ store <2 x double> %4, <2 x double>* %7, align 8
+ %.sum10 = or i64 %index, 2
+ %8 = getelementptr double* %a, i64 %.sum10
+ %9 = bitcast double* %8 to <2 x double>*
+ store <2 x double> %5, <2 x double>* %9, align 8
+ %index.next = add i64 %index, 4
+ %10 = icmp eq i64 %index.next, 1600
+ br i1 %10, label %for.end, label %vector.body
+
+; FIXME: We should probably unroll this loop by a factor of 2, but the cost
+; model needs to be fixed to account for instructions likely to be folded
+; as part of an addressing mode.
+; CHECK-LABEL: @foo
+; CHECK-NOUNRL-LABEL: @foo
+
+for.end: ; preds = %vector.body
+ ret void
+}
+
+define void @bar(i32* noalias nocapture readnone %ip, double %alpha, double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %v0 = getelementptr inbounds double* %b, i64 %index
+ %v1 = bitcast double* %v0 to <2 x double>*
+ %wide.load = load <2 x double>* %v1, align 8
+ %v4 = fadd <2 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00>
+ %v5 = fmul <2 x double> %v4, <double 8.000000e+00, double 8.000000e+00>
+ %v6 = getelementptr inbounds double* %a, i64 %index
+ %v7 = bitcast double* %v6 to <2 x double>*
+ store <2 x double> %v5, <2 x double>* %v7, align 8
+ %index.next = add i64 %index, 2
+ %v10 = icmp eq i64 %index.next, 1600
+ br i1 %v10, label %for.end, label %vector.body
+
+; FIXME: We should probably unroll this loop by a factor of 2, but the cost
+; model first needs to be fixed to account for instructions likely to be folded
+; as part of an addressing mode.
+
+; CHECK-LABEL: @bar
+; CHECK: fadd
+; CHECK-NEXT: fmul
+; CHECK: fadd
+; CHECK-NEXT: fmul
+
+; CHECK-NOUNRL-LABEL: @bar
+; CHECK-NOUNRL: fadd
+; CHECK-NOUNRL-NEXT: fmul
+; CHECK-NOUNRL-NOT: fadd
+
+for.end: ; preds = %vector.body
+ ret void
+}
+
+attributes #0 = { nounwind uwtable }
+
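
As the CHECK patterns show, with -mcpu=nehalem the new preferences allow @bar to be partially unrolled, so FileCheck expects two fadd/fmul pairs in the loop body; with -mcpu=core the hook presumably matches no threshold case and leaves the preferences untouched, so CHECK-NOUNRL requires that only the single original fadd/fmul pair survives.
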
diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index e98a4acddea..224823b8ed5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1,13 +1,13 @@
-; RUN: opt < %s -mcpu=corei7 -O1 -S | FileCheck %s --check-prefix=O1
-; RUN: opt < %s -mcpu=corei7 -O2 -S | FileCheck %s --check-prefix=O2
-; RUN: opt < %s -mcpu=corei7 -O3 -S | FileCheck %s --check-prefix=O3
-; RUN: opt < %s -mcpu=corei7 -Os -S | FileCheck %s --check-prefix=Os
-; RUN: opt < %s -mcpu=corei7 -Oz -S | FileCheck %s --check-prefix=Oz
-; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S | FileCheck %s --check-prefix=O1VEC
-; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S | FileCheck %s --check-prefix=OzVEC
-; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S | FileCheck %s --check-prefix=O1VEC2
-; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S | FileCheck %s --check-prefix=OzVEC2
-; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S | FileCheck %s --check-prefix=O3DIS
+; RUN: opt < %s -mcpu=corei7 -O1 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -Os -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3DIS
; This file tests the llvm.vectorizer.pragma forcing vectorization even when
; optimization levels are too low, or when vectorization is disabled.
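
Since the new x86-use-partial-unrolling flag defaults to on, these RUN lines pin it to 0 so the existing CHECK patterns continue to exercise only the pragma-driven vectorization behavior, unperturbed by the new partial-unrolling preferences.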