5 files changed, 624 insertions, 491 deletions
diff --git a/polly/include/polly/ScheduleOptimizer.h b/polly/include/polly/ScheduleOptimizer.h
index 2307288590a..db96b552931 100644
--- a/polly/include/polly/ScheduleOptimizer.h
+++ b/polly/include/polly/ScheduleOptimizer.h
@@ -12,6 +12,7 @@
 #ifndef POLLY_SCHEDULE_OPTIMIZER_H
 #define POLLY_SCHEDULE_OPTIMIZER_H
 
+#include "polly/DependenceInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "isl/ctx.h"
@@ -42,6 +43,31 @@ struct MacroKernelParamsTy {
 };
 
 namespace polly {
+/// Additional parameters of the schedule optimizer.
+///
+/// Target Transform Info and the SCoP dependencies used by the schedule
+/// optimizer.
+///
+struct OptimizerAdditionalInfoTy {
+  const llvm::TargetTransformInfo *TTI;
+  const Dependences *D;
+};
+
+/// Parameters of the matrix multiplication operands.
+///
+/// Parameters, which describe access relations that represent operands of the
+/// matrix multiplication.
+///
+struct MatMulInfoTy {
+  MemoryAccess *A = nullptr;
+  MemoryAccess *B = nullptr;
+  MemoryAccess *ReadFromC = nullptr;
+  MemoryAccess *WriteToC = nullptr;
+  int i = -1;
+  int j = -1;
+  int k = -1;
+};
+
 extern bool DisablePollyTiling;
 class Scop;
 } // namespace polly
@@ -59,11 +85,11 @@ public:
   ///
   /// @param Schedule The schedule object the transformations will be applied
   ///                 to.
-  /// @param TTI      Target Transform Info.
+  /// @param OAI      Target Transform Info and the SCoP dependencies.
   /// @returns        The transformed schedule.
   static __isl_give isl_schedule *
   optimizeSchedule(__isl_take isl_schedule *Schedule,
-                   const llvm::TargetTransformInfo *TTI = nullptr);
+                   const polly::OptimizerAdditionalInfoTy *OAI = nullptr);
 
   /// Apply schedule tree transformations.
   ///
@@ -75,11 +101,11 @@ public:
   ///   - Prevectorization
   ///
   /// @param Node The schedule object post-transformations will be applied to.
-  /// @param TTI  Target Transform Info.
+  /// @param OAI  Target Transform Info and the SCoP dependencies.
   /// @returns    The transformed schedule.
   static __isl_give isl_schedule_node *
   optimizeScheduleNode(__isl_take isl_schedule_node *Node,
-                       const llvm::TargetTransformInfo *TTI = nullptr);
+                       const polly::OptimizerAdditionalInfoTy *OAI = nullptr);
 
   /// Decide if the @p NewSchedule is profitable for @p S.
   ///
@@ -128,10 +154,11 @@ private:
 
   /// Apply the BLIS matmul optimization pattern.
   ///
-  /// Apply the BLIS matmul optimization pattern. BLIS implements gemm as three
-  /// nested loops around a macro-kernel, plus two packing routines.
-  /// The macro-kernel is implemented in terms of two additional loops around
-  /// a micro-kernel. The micro-kernel is a loop around a rank-1
+  /// Make the loops containing the matrix multiplication be the innermost
+  /// loops and apply the BLIS matmul optimization pattern. BLIS implements
+  /// gemm as three nested loops around a macro-kernel, plus two packing
+  /// routines. The macro-kernel is implemented in terms of two additional
+  /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
   /// (i.e., outer product) update.
   ///
   /// For a detailed description please see [1].
@@ -167,9 +194,13 @@ private:
   /// @param Node The node that contains a band to be optimized. The node
   ///             is required to successfully pass
   ///             ScheduleTreeOptimizer::isMatrMultPattern.
+  /// @param TTI  Target Transform Info.
+  /// @param MMI  Parameters of the matrix multiplication operands.
+  /// @returns    The transformed schedule.
   static __isl_give isl_schedule_node *
   optimizeMatMulPattern(__isl_take isl_schedule_node *Node,
-                        const llvm::TargetTransformInfo *TTI);
+                        const llvm::TargetTransformInfo *TTI,
+                        polly::MatMulInfoTy &MMI);
 
   /// Check if this node is a band node we want to tile.
   ///
@@ -266,7 +297,11 @@ private:
   /// the one used to get close-to-peak performance of matrix multiplications.
   ///
   /// @param Node The node to check.
-  static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node);
+  /// @param D    The SCoP dependencies.
+  /// @param MMI  Parameters of the matrix multiplication operands.
+  static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node,
+                                const polly::Dependences *D,
+                                polly::MatMulInfoTy &MMI);
 
   /// Create the BLIS macro-kernel.
   ///
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 4644692e94b..cb0a8edbf9c 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -468,26 +468,302 @@ ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
   return Node;
 }
 
-/// Check whether output dimensions of the map rely on the specified input
-/// dimension.
+/// Get the position of a dimension with a non-zero coefficient.
 ///
-/// @param IslMap The isl map to be considered.
-/// @param DimNum The number of an input dimension to be checked.
-static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
-  auto *CheckedAccessRelation =
-      isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
-  CheckedAccessRelation =
-      isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
-  auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
-  CheckedAccessRelation =
-      isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
-  InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
-  CheckedAccessRelation =
-      isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
-  auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
-  isl_map_free(CheckedAccessRelation);
-  isl_map_free(IslMap);
-  return res;
+/// Check that isl constraint @p Constraint has only one non-zero
+/// coefficient for dimensions that have type @p DimType. If this is true,
+/// return the position of the dimension corresponding to the non-zero
+/// coefficient, and a negative value otherwise.
+///
+/// @param Constraint The isl constraint to be checked.
+/// @param DimType    The type of the dimensions.
+/// @return           The position of the dimension in case the isl
+///                   constraint satisfies the requirements, a negative
+///                   value, otherwise.
+static int getMatMulConstraintDim(__isl_keep isl_constraint *Constraint,
+                                  enum isl_dim_type DimType) {
+  int DimPos = -1;
+  auto *LocalSpace = isl_constraint_get_local_space(Constraint);
+  int LocalSpaceDimNum = isl_local_space_dim(LocalSpace, DimType);
+  for (int i = 0; i < LocalSpaceDimNum; i++) {
+    auto *Val = isl_constraint_get_coefficient_val(Constraint, DimType, i);
+    if (isl_val_is_zero(Val)) {
+      isl_val_free(Val);
+      continue;
+    }
+    if (DimPos >= 0 || (DimType == isl_dim_out && !isl_val_is_one(Val)) ||
+        (DimType == isl_dim_in && !isl_val_is_negone(Val))) {
+      isl_val_free(Val);
+      isl_local_space_free(LocalSpace);
+      return -1;
+    }
+    DimPos = i;
+    isl_val_free(Val);
+  }
+  isl_local_space_free(LocalSpace);
+  return DimPos;
+}
+
+/// Check the form of the isl constraint.
+///
+/// Check that the isl constraint @p Constraint is an equality with a zero
+/// constant term in which exactly one input dimension has a coefficient of
+/// negative one, exactly one output dimension has a coefficient of one, and
+/// all other input and output dimensions have coefficients equal to zero.
+///
+/// @param Constraint The isl constraint to be checked.
+/// @param DimInPos   On success, the position of that input dimension.
+/// @param DimOutPos  On success, the position of that output dimension.
+/// @return           isl_stat_ok in case the isl constraint satisfies
+///                   the requirements, isl_stat_error otherwise.
+static isl_stat isMatMulOperandConstraint(__isl_keep isl_constraint *Constraint,
+                                          int &DimInPos, int &DimOutPos) {
+  auto *Val = isl_constraint_get_constant_val(Constraint);
+  if (!isl_constraint_is_equality(Constraint) || !isl_val_is_zero(Val)) {
+    isl_val_free(Val);
+    return isl_stat_error;
+  }
+  isl_val_free(Val);
+  DimInPos = getMatMulConstraintDim(Constraint, isl_dim_in);
+  if (DimInPos < 0)
+    return isl_stat_error;
+  DimOutPos = getMatMulConstraintDim(Constraint, isl_dim_out);
+  if (DimOutPos < 0)
+    return isl_stat_error;
+  return isl_stat_ok;
+}
+
+/// Check that the access relation corresponds to a non-constant operand
+/// of the matrix multiplication.
+///
+/// Access relations that correspond to non-constant operands of the matrix
+/// multiplication depend only on two input dimensions and have two output
+/// dimensions. The function checks that the isl basic map @p bmap satisfies
+/// the requirements. The two input dimensions can be specified via @p user
+/// array.
+///
+/// @param bmap The isl basic map to be checked.
+/// @param user The input dimensions of @p bmap.
+/// @return     isl_stat_ok in case isl basic map satisfies the requirements,
+///             isl_stat_error otherwise.
+static isl_stat isMatMulOperandBasicMap(__isl_take isl_basic_map *bmap,
+                                        void *user) {
+  auto *Constraints = isl_basic_map_get_constraint_list(bmap);
+  isl_basic_map_free(bmap);
+  if (isl_constraint_list_n_constraint(Constraints) != 2) {
+    isl_constraint_list_free(Constraints);
+    return isl_stat_error;
+  }
+  int InPosPair[] = {-1, -1};
+  auto DimInPos = user ? static_cast<int *>(user) : InPosPair;
+  for (int i = 0; i < 2; i++) {
+    auto *Constraint = isl_constraint_list_get_constraint(Constraints, i);
+    int InPos, OutPos;
+    if (isMatMulOperandConstraint(Constraint, InPos, OutPos) ==
+            isl_stat_error ||
+        OutPos > 1 || (DimInPos[OutPos] >= 0 && DimInPos[OutPos] != InPos)) {
+      isl_constraint_free(Constraint);
+      isl_constraint_list_free(Constraints);
+      return isl_stat_error;
+    }
+    DimInPos[OutPos] = InPos;
+    isl_constraint_free(Constraint);
+  }
+  isl_constraint_list_free(Constraints);
+  return isl_stat_ok;
+}
+
+/// Permute the two dimensions of the isl map.
+///
+/// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
+/// have type @p DimType.
+///
+/// @param Map     The isl map to be modified.
+/// @param DimType The type of the dimensions.
+/// @param DstPos  The first dimension.
+/// @param SrcPos  The second dimension.
+/// @return        The modified map.
+__isl_give isl_map *permuteDimensions(__isl_take isl_map *Map,
+                                      enum isl_dim_type DimType,
+                                      unsigned DstPos, unsigned SrcPos) {
+  assert(DstPos < isl_map_dim(Map, DimType) &&
+         SrcPos < isl_map_dim(Map, DimType));
+  if (DstPos == SrcPos)
+    return Map;
+  isl_id *DimId = nullptr;
+  if (isl_map_has_tuple_id(Map, DimType))
+    DimId = isl_map_get_tuple_id(Map, DimType);
+  auto FreeDim = DimType == isl_dim_in ? isl_dim_out : isl_dim_in;
+  isl_id *FreeDimId = nullptr;
+  if (isl_map_has_tuple_id(Map, FreeDim))
+    FreeDimId = isl_map_get_tuple_id(Map, FreeDim);
+  auto MaxDim = std::max(DstPos, SrcPos);
+  auto MinDim = std::min(DstPos, SrcPos);
+  Map = isl_map_move_dims(Map, FreeDim, 0, DimType, MaxDim, 1);
+  Map = isl_map_move_dims(Map, FreeDim, 0, DimType, MinDim, 1);
+  Map = isl_map_move_dims(Map, DimType, MinDim, FreeDim, 1, 1);
+  Map = isl_map_move_dims(Map, DimType, MaxDim, FreeDim, 0, 1);
+  if (DimId)
+    Map = isl_map_set_tuple_id(Map, DimType, DimId);
+  if (FreeDimId)
+    Map = isl_map_set_tuple_id(Map, FreeDim, FreeDimId);
+  return Map;
+}
+
+/// Check the form of the access relation.
+///
+/// Check that the access relation @p AccMap has the form M[i][j], where i
+/// is @p FirstPos and j is @p SecondPos.
+///
+/// @param AccMap    The access relation to be checked.
+/// @param FirstPos  The index of the input dimension that is mapped to
+///                  the first output dimension.
+/// @param SecondPos The index of the input dimension that is mapped to the
+///                  second output dimension.
+/// @return          True in case @p AccMap has the expected form and false,
+///                  otherwise.
+static bool isMatMulOperandAcc(__isl_keep isl_map *AccMap, int &FirstPos,
+                               int &SecondPos) {
+  int DimInPos[] = {FirstPos, SecondPos};
+  if (isl_map_foreach_basic_map(AccMap, isMatMulOperandBasicMap,
+                                static_cast<void *>(DimInPos)) != isl_stat_ok ||
+      DimInPos[0] < 0 || DimInPos[1] < 0)
+    return false;
+  FirstPos = DimInPos[0];
+  SecondPos = DimInPos[1];
+  return true;
+}
+
+/// Does the memory access represent a non-scalar operand of the matrix
+/// multiplication.
+///
+/// Check that the memory access @p MemAccess is the read access to a non-scalar
+/// operand of the matrix multiplication or its result.
+///
+/// @param MemAccess The memory access to be checked.
+/// @param MMI       Parameters of the matrix multiplication operands.
+/// @return          True in case the memory access represents the read access
+///                  to a non-scalar operand of the matrix multiplication and
+///                  false, otherwise.
+static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess,
+                                        MatMulInfoTy &MMI) {
+  if (!MemAccess->isArrayKind() || !MemAccess->isRead())
+    return false;
+  isl_map *AccMap = MemAccess->getAccessRelation();
+  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.j) && !MMI.ReadFromC &&
+      isl_map_n_basic_map(AccMap) == 1) {
+    MMI.ReadFromC = MemAccess;
+    isl_map_free(AccMap);
+    return true;
+  }
+  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.k) && !MMI.A &&
+      isl_map_n_basic_map(AccMap) == 1) {
+    MMI.A = MemAccess;
+    isl_map_free(AccMap);
+    return true;
+  }
+  if (isMatMulOperandAcc(AccMap, MMI.k, MMI.j) && !MMI.B &&
+      isl_map_n_basic_map(AccMap) == 1) {
+    MMI.B = MemAccess;
+    isl_map_free(AccMap);
+    return true;
+  }
+  isl_map_free(AccMap);
+  return false;
+}
+
+/// Check accesses to operands of the matrix multiplication.
+///
+/// Check that accesses of the SCoP statement, which corresponds to
+/// the partial schedule @p PartialSchedule, are scalar in terms of loops
+/// containing the matrix multiplication, in case they do not represent
+/// accesses to the non-scalar operands of the matrix multiplication or
+/// its result.
+///
+/// @param  PartialSchedule The partial schedule of the SCoP statement.
+/// @param  MMI             Parameters of the matrix multiplication operands.
+/// @return                 True in case the corresponding SCoP statement
+///                         represents matrix multiplication and false,
+///                         otherwise.
+static bool containsOnlyMatrMultAcc(__isl_keep isl_map *PartialSchedule,
+                                    MatMulInfoTy &MMI) {
+  auto *InputDimId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
+  auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimId));
+  isl_id_free(InputDimId);
+  unsigned OutDimNum = isl_map_dim(PartialSchedule, isl_dim_out);
+  assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest "
+                          "and, consequently, the corresponding scheduling "
+                          "functions have at least three dimensions.");
+  auto *MapI = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out,
+                                 MMI.i, OutDimNum - 1);
+  auto *MapJ = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out,
+                                 MMI.j, OutDimNum - 1);
+  auto *MapK = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out,
+                                 MMI.k, OutDimNum - 1);
+  for (auto *MemA = Stmt->begin(); MemA != Stmt->end() - 1; MemA++) {
+    auto *MemAccessPtr = *MemA;
+    if (MemAccessPtr->isArrayKind() && MemAccessPtr != MMI.WriteToC &&
+        !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
+        !(MemAccessPtr->isStrideZero(isl_map_copy(MapI)) &&
+          MemAccessPtr->isStrideZero(isl_map_copy(MapJ)) &&
+          MemAccessPtr->isStrideZero(isl_map_copy(MapK)))) {
+      isl_map_free(MapI);
+      isl_map_free(MapJ);
+      isl_map_free(MapK);
+      return false;
+    }
+  }
+  isl_map_free(MapI);
+  isl_map_free(MapJ);
+  isl_map_free(MapK);
+  return true;
+}
+
+/// Check for dependencies corresponding to the matrix multiplication.
+///
+/// Check that there are no anti dependences and that the only true dependence
+/// has the form S(..., k, ...) -> S(..., k + 1, ...), where S is the SCoP
+/// statement represented by @p Schedule and k is @p Pos. Such a dependence
+/// corresponds to the dependency produced by the matrix multiplication.
+///
+/// @param  Schedule The schedule of the SCoP statement.
+/// @param  D The SCoP dependencies.
+/// @param  Pos The dimension that carries the true dependence. In case it
+///             has a negative value, it is set to the first dimension whose
+///             dependence distance is fixed to one.
+/// @return True in case the dependencies correspond to the matrix
+///         multiplication and @p Pos is non-negative, false otherwise.
+static bool containsOnlyMatMulDep(__isl_keep isl_map *Schedule,
+                                  const Dependences *D, int &Pos) {
+  auto *WAR = D->getDependences(Dependences::TYPE_WAR);
+  bool HasWAR = !isl_union_map_is_empty(WAR);
+  isl_union_map_free(WAR);
+  if (HasWAR)
+    return false;
+  auto *RAW = D->getDependences(Dependences::TYPE_RAW);
+  auto *Domain = isl_map_domain(isl_map_copy(Schedule));
+  auto *Space = isl_space_map_from_domain_and_range(isl_set_get_space(Domain),
+                                                    isl_set_get_space(Domain));
+  isl_set_free(Domain);
+  auto *Deltas = isl_map_deltas(isl_union_map_extract_map(RAW, Space));
+  int DeltasDimNum = isl_set_dim(Deltas, isl_dim_set);
+  for (int i = 0; i < DeltasDimNum; i++) {
+    auto *Val = isl_set_plain_get_val_if_fixed(Deltas, isl_dim_set, i);
+    if (Pos < 0 && isl_val_is_one(Val))
+      Pos = i;
+    if (isl_val_is_nan(Val) ||
+        !(isl_val_is_zero(Val) || (i == Pos && isl_val_is_one(Val)))) {
+      isl_val_free(Val);
+      isl_union_map_free(RAW);
+      isl_set_free(Deltas);
+      return false;
+    }
+    isl_val_free(Val);
+  }
+  isl_union_map_free(RAW);
+  isl_set_free(Deltas);
+  // Callers use Pos as a dimension index, so it must have been determined.
+  return DeltasDimNum != 0 && Pos >= 0;
 }
 
 /// Check if the SCoP statement could probably be optimized with analytical
@@ -495,50 +771,57 @@ static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
 ///
 /// containsMatrMult tries to determine whether the following conditions
 /// are true:
-/// 1. all memory accesses of the statement will have stride 0 or 1,
-///    if we interchange loops (switch the variable used in the inner
-///    loop to the outer loop).
-/// 2. all memory accesses of the statement except from the last one, are
-///    read memory access and the last one is write memory access.
-/// 3. all subscripts of the last memory access of the statement don't contain
-///    the variable used in the inner loop.
+/// 1. The last memory access modeling an array, MA1, represents writing to
+///    memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or
+///    S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement
+///    under consideration.
+/// 2. There is only one loop-carried true dependency, and it has the
+///    form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no
+///    other loop-carried true dependencies and no anti dependencies.
+/// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent
+///    reading from memory and have the form S(..., i3, ...) -> M(i1, i3),
+///    S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively,
+///    and all memory accesses of the SCoP that are different from MA1, MA2,
+///    MA3, and MA4 have stride 0, if the innermost loop is exchanged with any
+///    of loops i1, i2 and i3.
 ///
 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement
 ///        to check.
-static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
-  auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
-  auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
+/// @param D   The SCoP dependencies.
+/// @param MMI Parameters of the matrix multiplication operands.
+static bool containsMatrMult(__isl_keep isl_map *PartialSchedule,
+                             const Dependences *D, MatMulInfoTy &MMI) {
+  auto *InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
+  auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
   isl_id_free(InputDimsId);
-  if (ScpStmt->size() <= 1)
+  if (Stmt->size() <= 1)
     return false;
-  auto MemA = ScpStmt->begin();
-  for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
-       i++, MemA++)
-    if (!(*MemA)->isRead() ||
-        ((*MemA)->isArrayKind() &&
-         !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
-           (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
+  for (auto *MemA = Stmt->end() - 1; MemA != Stmt->begin(); MemA--) {
+    auto *MemAccessPtr = *MemA;
+    if (!MemAccessPtr->isArrayKind())
+      continue;
+    if (!MemAccessPtr->isWrite())
+      return false;
+    auto *AccMap = MemAccessPtr->getAccessRelation();
+    if (isl_map_n_basic_map(AccMap) != 1 ||
+        !isMatMulOperandAcc(AccMap, MMI.i, MMI.j)) {
+      isl_map_free(AccMap);
       return false;
-  MemA++;
-  if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
-      !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
-        (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
+    }
+    isl_map_free(AccMap);
+    MMI.WriteToC = MemAccessPtr;
+    break;
+  }
+
+  if (!containsOnlyMatMulDep(PartialSchedule, D, MMI.k))
     return false;
-  auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
-  return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
-}
 
-/// Circular shift of output dimensions of the integer map.
-///
-/// @param IslMap The isl map to be modified.
-static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
-  auto DimNum = isl_map_dim(IslMap, isl_dim_out);
-  if (DimNum == 0)
-    return IslMap;
-  auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
-  IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
-  IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
-  return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
+  if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI))
+    return false;
+
+  if (!MMI.A || !MMI.B || !MMI.ReadFromC)
+    return false;
+  return true;
 }
 
 /// Permute two dimensions of the band node.
@@ -581,12 +864,15 @@ __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
   if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
       MacroKernelParams.Kc == 1)
     return Node;
-  Node = tileNode(
-      Node, "1st level tiling",
-      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
+  int DimOutNum = isl_schedule_node_band_n_member(Node);
+  std::vector<int> TileSizes(DimOutNum, 1);
+  TileSizes[DimOutNum - 3] = MacroKernelParams.Mc;
+  TileSizes[DimOutNum - 2] = MacroKernelParams.Nc;
+  TileSizes[DimOutNum - 1] = MacroKernelParams.Kc;
+  Node = tileNode(Node, "1st level tiling", TileSizes, 1);
   Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
-  Node = permuteBandNodeDimensions(Node, 1, 2);
-  Node = permuteBandNodeDimensions(Node, 0, 2);
+  Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
+  Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
   return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
 }
 
@@ -659,165 +945,6 @@ getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
   return {Mc, Nc, Kc};
 }
 
-/// Identify a memory access through the shape of its memory access relation.
-///
-/// Identify the unique memory access in @p Stmt, that has an access relation
-/// equal to @p ExpectedAccessRelation.
-///
-/// @param Stmt The SCoP statement that contains the memory accesses under
-///             consideration.
-/// @param ExpectedAccessRelation The access relation that identifies
-///                               the memory access.
-/// @return  The memory access of @p Stmt whose memory access relation is equal
-///          to @p ExpectedAccessRelation. nullptr in case there is no or more
-///          than one such access.
-MemoryAccess *
-identifyAccessByAccessRelation(ScopStmt *Stmt,
-                               __isl_take isl_map *ExpectedAccessRelation) {
-  if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
-    ExpectedAccessRelation =
-        isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);
-  MemoryAccess *IdentifiedAccess = nullptr;
-  for (auto *Access : *Stmt) {
-    auto *AccessRelation = Access->getAccessRelation();
-    AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out);
-    if (isl_map_is_equal(ExpectedAccessRelation, AccessRelation)) {
-      if (IdentifiedAccess) {
-        isl_map_free(AccessRelation);
-        isl_map_free(ExpectedAccessRelation);
-        return nullptr;
-      }
-      IdentifiedAccess = Access;
-    }
-    isl_map_free(AccessRelation);
-  }
-  isl_map_free(ExpectedAccessRelation);
-  return IdentifiedAccess;
-}
-
-/// Add constrains to @Dim dimension of @p ExtMap.
-///
-/// If @ExtMap has the following form [O0, O1, O2]->[I1, I2, I3],
-/// the following constraint will be added
-/// Bound * OM <= IM <= Bound * (OM + 1) - 1,
-/// where M is @p Dim and Bound is @p Bound.
-///
-/// @param ExtMap The isl map to be modified.
-/// @param Dim The output dimension to be modfied.
-/// @param Bound The value that is used to specify the constraint.
-/// @return The modified isl map
-__isl_give isl_map *
-addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
-                                   unsigned Bound) {
-  assert(Bound != 0);
-  auto *ExtMapSpace = isl_map_get_space(ExtMap);
-  auto *ConstrSpace = isl_local_space_from_space(ExtMapSpace);
-  auto *Constr =
-      isl_constraint_alloc_inequality(isl_local_space_copy(ConstrSpace));
-  Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, 1);
-  Constr =
-      isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound * (-1));
-  ExtMap = isl_map_add_constraint(ExtMap, Constr);
-  Constr = isl_constraint_alloc_inequality(ConstrSpace);
-  Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, -1);
-  Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound);
-  Constr = isl_constraint_set_constant_si(Constr, Bound - 1);
-  return isl_map_add_constraint(ExtMap, Constr);
-}
-
-/// Create an access relation that is specific for matrix multiplication
-/// pattern.
-///
-/// Create an access relation of the following form:
-/// { [O0, O1, O2]->[I1, I2, I3] :
-///   FirstOutputDimBound * O0 <= I1 <= FirstOutputDimBound * (O0 + 1) - 1
-///   and SecondOutputDimBound * O1 <= I2 <= SecondOutputDimBound * (O1 + 1) - 1
-///   and ThirdOutputDimBound * O2 <= I3 <= ThirdOutputDimBound * (O2 + 1) - 1}
-///   where FirstOutputDimBound is @p FirstOutputDimBound,
-///   SecondOutputDimBound is @p SecondOutputDimBound,
-///   ThirdOutputDimBound is @p ThirdOutputDimBound
-///
-/// @param Ctx The isl context.
-/// @param FirstOutputDimBound,
-///        SecondOutputDimBound,
-///        ThirdOutputDimBound The parameters of the access relation.
-/// @return The specified access relation.
-__isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
-                                 unsigned SecondOutputDimBound,
-                                 unsigned ThirdOutputDimBound) {
-  auto *NewRelSpace = isl_space_alloc(Ctx, 0, 3, 3);
-  auto *extensionMap = isl_map_universe(NewRelSpace);
-  if (!FirstOutputDimBound)
-    extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 0, 0);
-  else
-    extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 0,
-                                                      FirstOutputDimBound);
-  if (!SecondOutputDimBound)
-    extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 1, 0);
-  else
-    extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 1,
-                                                      SecondOutputDimBound);
-  if (!ThirdOutputDimBound)
-    extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 2, 0);
-  else
-    extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 2,
-                                                      ThirdOutputDimBound);
-  return extensionMap;
-}
-
-/// Create an access relation that is specific to the matrix
-///        multiplication pattern.
-///
-/// Create an access relation of the following form:
-/// Stmt[O0, O1, O2]->[OI, OJ],
-/// where I is @p I, J is @J
-///
-/// @param Stmt The SCoP statement for which to generate the access relation.
-/// @param I The index of the input dimension that is mapped to the first output
-///          dimension.
-/// @param J The index of the input dimension that is mapped to the second
-///          output dimension.
-/// @return The specified access relation.
-__isl_give isl_map *
-getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
-  auto *AccessRelSpace = isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2);
-  auto *AccessRel = isl_map_universe(AccessRelSpace);
-  AccessRel = isl_map_equate(AccessRel, isl_dim_in, I, isl_dim_out, 0);
-  AccessRel = isl_map_equate(AccessRel, isl_dim_in, J, isl_dim_out, 1);
-  AccessRel = isl_map_set_tuple_id(AccessRel, isl_dim_in, Stmt->getDomainId());
-  return AccessRel;
-}
-
-/// Identify the memory access that corresponds to the access to the second
-/// operand of the matrix multiplication.
-///
-/// Identify the memory access that corresponds to the access
-/// to the matrix B of the matrix multiplication C = A x B.
-///
-/// @param Stmt The SCoP statement that contains the memory accesses
-///             under consideration.
-/// @return The memory access of @p Stmt that corresponds to the access
-///         to the second operand of the matrix multiplication.
-MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
-  auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 0, 2);
-  return identifyAccessByAccessRelation(Stmt, OriginalRel);
-}
-
-/// Identify the memory access that corresponds to the access to the first
-/// operand of the matrix multiplication.
-///
-/// Identify the memory access that corresponds to the access
-/// to the matrix A of the matrix multiplication C = A x B.
-///
-/// @param Stmt The SCoP statement that contains the memory accesses
-///             under consideration.
-/// @return The memory access of @p Stmt that corresponds to the access
-///         to the first operand of the matrix multiplication.
-MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
-  auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);
-  return identifyAccessByAccessRelation(Stmt, OriginalRel);
-}
-
 /// Create an access relation that is specific to
 ///        the matrix multiplication pattern.
 ///
@@ -893,21 +1020,15 @@ createExtensionNode(__isl_take isl_schedule_node *Node,
 ///                     transformations.
 /// @param MicroParams, MacroParams Parameters of the BLIS kernel
 ///                                 to be taken into account.
+/// @param MMI Parameters of the matrix multiplication operands.
 /// @return The optimized schedule node.
 static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
     __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
-    MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
-  // Check whether memory accesses of the SCoP statement correspond to
-  // the matrix multiplication pattern and if this is true, obtain them.
+    MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams,
+    MatMulInfoTy &MMI) {
   auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
   auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
   isl_id_free(InputDimsId);
-  MemoryAccess *MemAccessA = identifyAccessA(Stmt);
-  MemoryAccess *MemAccessB = identifyAccessB(Stmt);
-  if (!MemAccessA || !MemAccessB) {
-    isl_map_free(MapOldIndVar);
-    return Node;
-  }
 
   // Create a copy statement that corresponds to the memory access to the
   // matrix B, the second operand of the matrix multiplication.
@@ -920,23 +1041,23 @@ static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
   unsigned SecondDimSize = MacroParams.Kc;
   unsigned ThirdDimSize = MicroParams.Nr;
   auto *SAI = Stmt->getParent()->createScopArrayInfo(
-      MemAccessB->getElementType(), "Packed_B",
+      MMI.B->getElementType(), "Packed_B",
       {FirstDimSize, SecondDimSize, ThirdDimSize});
   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
-  auto *OldAcc = MemAccessB->getAccessRelation();
-  MemAccessB->setNewAccessRelation(AccRel);
+  auto *OldAcc = MMI.B->getAccessRelation();
+  MMI.B->setNewAccessRelation(AccRel);
   auto *ExtMap =
-      getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
-  isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
-  isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
-  ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
+      isl_map_project_out(isl_map_copy(MapOldIndVar), isl_dim_out, 2,
+                          isl_map_dim(MapOldIndVar, isl_dim_out) - 2);
+  ExtMap = isl_map_reverse(ExtMap);
+  ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.i, 0);
   auto *Domain = Stmt->getDomain();
 
   // Restrict the domains of the copy statements to only execute when also its
   // originating statement is executed.
   auto *DomainId = isl_set_get_tuple_id(Domain);
   auto *NewStmt = Stmt->getParent()->addScopStmt(
-      OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
+      OldAcc, MMI.B->getAccessRelation(), isl_set_copy(Domain));
   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
   ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
   ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
@@ -945,20 +1066,21 @@ static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
   // Create a copy statement that corresponds to the memory access
   // to the matrix A, the first operand of the matrix multiplication.
   Node = isl_schedule_node_child(Node, 0);
-  AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
+  AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 4, 6);
   FirstDimSize = MacroParams.Mc / MicroParams.Mr;
   ThirdDimSize = MicroParams.Mr;
   SAI = Stmt->getParent()->createScopArrayInfo(
-      MemAccessA->getElementType(), "Packed_A",
+      MMI.A->getElementType(), "Packed_A",
       {FirstDimSize, SecondDimSize, ThirdDimSize});
   AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
-  OldAcc = MemAccessA->getAccessRelation();
-  MemAccessA->setNewAccessRelation(AccRel);
-  ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
-  isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
-  isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
-  NewStmt = Stmt->getParent()->addScopStmt(
-      OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
+  OldAcc = MMI.A->getAccessRelation();
+  MMI.A->setNewAccessRelation(AccRel);
+  ExtMap = isl_map_project_out(MapOldIndVar, isl_dim_out, 3,
+                               isl_map_dim(MapOldIndVar, isl_dim_out) - 3);
+  ExtMap = isl_map_reverse(ExtMap);
+  ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.j, 0);
+  NewStmt = Stmt->getParent()->addScopStmt(OldAcc, MMI.A->getAccessRelation(),
+                                           isl_set_copy(Domain));
 
   // Restrict the domains of the copy statements to only execute when also its
   // originating statement is executed.
@@ -998,8 +1120,19 @@ getInductionVariablesSubstitution(__isl_take isl_schedule_node *Node,
 }
 
 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
-    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
+    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI,
+    MatMulInfoTy &MMI) {
   assert(TTI && "The target transform info should be provided.");
+  int DimOutNum = isl_schedule_node_band_n_member(Node);
+  assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
+                          "and, consequently, the corresponding scheduling "
+                          "functions have at least three dimensions.");
+  Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
+  int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
+  int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
+  Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2);
+  NewK = NewK == DimOutNum - 2 ? NewJ : NewK; // follow k through both swaps
+  Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1);
   auto MicroKernelParams = getMicroKernelParams(TTI);
   auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
   Node = createMacroKernel(Node, MacroKernelParams);
@@ -1012,21 +1145,21 @@ __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
   if (!MapOldIndVar)
     return Node;
   return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
-                                          MacroKernelParams);
+                                          MacroKernelParams, MMI);
 }
 
 bool ScheduleTreeOptimizer::isMatrMultPattern(
-    __isl_keep isl_schedule_node *Node) {
+    __isl_keep isl_schedule_node *Node, const Dependences *D,
+    MatMulInfoTy &MMI) {
   auto *PartialSchedule =
       isl_schedule_node_band_get_partial_schedule_union_map(Node);
-  if (isl_schedule_node_band_n_member(Node) != 3 ||
+  if (isl_schedule_node_band_n_member(Node) < 3 ||
       isl_union_map_n_map(PartialSchedule) != 1) {
     isl_union_map_free(PartialSchedule);
     return false;
   }
   auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
-  NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule);
-  if (containsMatrMult(NewPartialSchedule)) {
+  if (containsMatrMult(NewPartialSchedule, D, MMI)) {
     isl_map_free(NewPartialSchedule);
     return true;
   }
@@ -1040,11 +1173,13 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
   if (!isTileableBandNode(Node))
     return Node;
 
-  if (PMBasedOpts && User && isMatrMultPattern(Node)) {
+  const OptimizerAdditionalInfoTy *OAI =
+      static_cast<const OptimizerAdditionalInfoTy *>(User);
+
+  MatMulInfoTy MMI;
+  if (PMBasedOpts && User && isMatrMultPattern(Node, OAI->D, MMI)) {
     DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
-    const llvm::TargetTransformInfo *TTI;
-    TTI = static_cast<const llvm::TargetTransformInfo *>(User);
-    Node = optimizeMatMulPattern(Node, TTI);
+    Node = optimizeMatMulPattern(Node, OAI->TTI, MMI);
   }
 
   return standardBandOpts(Node, User);
@@ -1052,9 +1187,9 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
 
 __isl_give isl_schedule *
 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
-                                        const llvm::TargetTransformInfo *TTI) {
+                                        const OptimizerAdditionalInfoTy *OAI) {
   isl_schedule_node *Root = isl_schedule_get_root(Schedule);
-  Root = optimizeScheduleNode(Root, TTI);
+  Root = optimizeScheduleNode(Root, OAI);
   isl_schedule_free(Schedule);
   auto S = isl_schedule_node_get_schedule(Root);
   isl_schedule_node_free(Root);
@@ -1062,9 +1197,9 @@ ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
 }
 
 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
-    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
+    __isl_take isl_schedule_node *Node, const OptimizerAdditionalInfoTy *OAI) {
   Node = isl_schedule_node_map_descendant_bottom_up(
-      Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
+      Node, optimizeBand, const_cast<void *>(static_cast<const void *>(OAI)));
   return Node;
 }
 
@@ -1264,8 +1399,9 @@ bool IslScheduleOptimizer::runOnScop(Scop &S) {
 
   Function &F = S.getFunction();
   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  const OptimizerAdditionalInfoTy OAI = {TTI, const_cast<Dependences *>(&D)};
   isl_schedule *NewSchedule =
-      ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);
+      ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
 
   if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
     isl_schedule_free(NewSchedule);
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
index e63f80c56cd..b30d4e798c5 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -15,63 +15,49 @@
 ; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
 
-define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) {
+define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 {
 bb:
   br label %bb8
 
-bb8:                                              ; preds = %bb39, %bb
-  %tmp = phi i32 [ 0, %bb ], [ %tmp40, %bb39 ]
-  %tmp9 = icmp slt i32 %tmp, 1056
-  br i1 %tmp9, label %bb10, label %bb41
-
-bb10:                                             ; preds = %bb8
-  br label %bb11
-
-bb11:                                             ; preds = %bb37, %bb10
-  %tmp12 = phi i32 [ 0, %bb10 ], [ %tmp38, %bb37 ]
-  %tmp13 = icmp slt i32 %tmp12, 1056
-  br i1 %tmp13, label %bb14, label %bb39
-
-bb14:                                             ; preds = %bb11
-  %tmp15 = sext i32 %tmp12 to i64
-  %tmp16 = sext i32 %tmp to i64
-  %tmp17 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp16
-  %tmp18 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp17, i64 0, i64 %tmp15
-  %tmp19 = load double, double* %tmp18, align 8
-  %tmp20 = fmul double %tmp19, %arg4
-  store double %tmp20, double* %tmp18, align 8
-  br label %bb21
-
-bb21:                                             ; preds = %bb24, %bb14
-  %tmp22 = phi i32 [ 0, %bb14 ], [ %tmp36, %bb24 ]
-  %tmp23 = icmp slt i32 %tmp22, 1024
-  br i1 %tmp23, label %bb24, label %bb37
-
-bb24:                                             ; preds = %bb21
-  %tmp25 = sext i32 %tmp22 to i64
-  %tmp26 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp16
-  %tmp27 = getelementptr inbounds [1024 x double], [1024 x double]* %tmp26, i64 0, i64 %tmp25
-  %tmp28 = load double, double* %tmp27, align 8
-  %tmp29 = fmul double %arg3, %tmp28
-  %tmp30 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp25
-  %tmp31 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp30, i64 0, i64 %tmp15
-  %tmp32 = load double, double* %tmp31, align 8
-  %tmp33 = fmul double %tmp29, %tmp32
-  %tmp34 = load double, double* %tmp18, align 8
-  %tmp35 = fadd double %tmp34, %tmp33
-  store double %tmp35, double* %tmp18, align 8
-  %tmp36 = add nsw i32 %tmp22, 1
-  br label %bb21
-
-bb37:                                             ; preds = %bb21
-  %tmp38 = add nsw i32 %tmp12, 1
-  br label %bb11
-
-bb39:                                             ; preds = %bb11
-  %tmp40 = add nsw i32 %tmp, 1
-  br label %bb8
-
-bb41:                                             ; preds = %bb8
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
+
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
+
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1024
+  br i1 %tmp25, label %Copy_0, label %bb26
+
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 1
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
+
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
+
+bb32:                                             ; preds = %bb29
   ret void
 }
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
index 82b10a04710..5498f847687 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@@ -17,63 +17,49 @@
 ; CHECK-NOT: The matrix multiplication pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
 
-define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) {
+define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 {
 bb:
   br label %bb8
 
-bb8:                                              ; preds = %bb39, %bb
-  %tmp = phi i32 [ 0, %bb ], [ %tmp40, %bb39 ]
-  %tmp9 = icmp slt i32 %tmp, 1056
-  br i1 %tmp9, label %bb10, label %bb41
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
 
-bb10:                                             ; preds = %bb8
-  br label %bb11
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
 
-bb11:                                             ; preds = %bb37, %bb10
-  %tmp12 = phi i32 [ 0, %bb10 ], [ %tmp38, %bb37 ]
-  %tmp13 = icmp slt i32 %tmp12, 1056
-  br i1 %tmp13, label %bb14, label %bb39
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1024
+  br i1 %tmp25, label %Copy_0, label %bb26
 
-bb14:                                             ; preds = %bb11
-  %tmp15 = sext i32 %tmp12 to i64
-  %tmp16 = sext i32 %tmp to i64
-  %tmp17 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp16
-  %tmp18 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp17, i64 0, i64 %tmp15
-  %tmp19 = load double, double* %tmp18, align 8
-  %tmp20 = fmul double %tmp19, %arg4
-  store double %tmp20, double* %tmp18, align 8
-  br label %bb21
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 2
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
 
-bb21:                                             ; preds = %bb24, %bb14
-  %tmp22 = phi i32 [ 0, %bb14 ], [ %tmp36, %bb24 ]
-  %tmp23 = icmp slt i32 %tmp22, 1024
-  br i1 %tmp23, label %bb24, label %bb37
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
 
-bb24:                                             ; preds = %bb21
-  %tmp25 = sext i32 %tmp22 to i64
-  %tmp26 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp16
-  %tmp27 = getelementptr inbounds [1024 x double], [1024 x double]* %tmp26, i64 0, i64 %tmp25
-  %tmp28 = load double, double* %tmp27, align 8
-  %tmp29 = fmul double %arg3, %tmp28
-  %tmp30 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp25
-  %tmp31 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp30, i64 0, i64 %tmp15
-  %tmp32 = load double, double* %tmp31, align 8
-  %tmp33 = fmul double %tmp29, %tmp32
-  %tmp34 = load double, double* %tmp18, align 8
-  %tmp35 = fadd double %tmp34, %tmp33
-  store double %tmp35, double* %tmp18, align 8
-  %tmp36 = add nsw i32 %tmp22, 1
-  br label %bb21
-
-bb37:                                             ; preds = %bb21
-  %tmp38 = add nsw i32 %tmp12, 2
-  br label %bb11
-
-bb39:                                             ; preds = %bb11
-  %tmp40 = add nsw i32 %tmp, 1
-  br label %bb8
-
-bb41:                                             ; preds = %bb8
+bb32:                                             ; preds = %bb29
   ret void
 }
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
index 75e81587007..0f9edf5b442 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -31,7 +31,7 @@
 ; CHECK-NEXT:          // 1st level tiling - Points
 ; CHECK-NEXT:          for (int c2 = 0; c2 <= 31; c2 += 1)
 ; CHECK-NEXT:            for (int c3 = 0; c3 <= 31; c3 += 1)
-; CHECK-NEXT:              Stmt_bb14(32 * c0 + c2, 32 * c1 + c3);
+; CHECK-NEXT:              Stmt_bb9(32 * c0 + c2, 32 * c1 + c3);
 ; CHECK-NEXT:        }
 ; CHECK-NEXT:      // Register tiling - Tiles
 ; CHECK-NEXT:      for (int c0 = 0; c0 <= 131; c0 += 1)
@@ -41,38 +41,38 @@
 ; CHECK-NEXT:            // 1st level tiling - Tiles
 ; CHECK-NEXT:            // 1st level tiling - Points
 ; CHECK-NEXT:            {
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 1, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 2, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 3, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 4, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 5, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 6, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1, 8 * c0 + 7, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 1, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 2, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 3, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 4, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 5, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 6, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 1, 8 * c0 + 7, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 1, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 2, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 3, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 4, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 5, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 6, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 2, 8 * c0 + 7, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 1, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 2, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 3, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 4, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 5, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 6, c2);
-; CHECK-NEXT:              Stmt_bb24(4 * c1 + 3, 8 * c0 + 7, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 1, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 2, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 3, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 4, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 5, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 6, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1, 8 * c0 + 7, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 1, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 2, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 3, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 4, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 5, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 6, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 7, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 1, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 2, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 3, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 4, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 5, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 6, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 7, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 1, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 2, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 3, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 4, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 5, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 6, c2);
+; CHECK-NEXT:              Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 7, c2);
 ; CHECK-NEXT:            }
 ; CHECK-NEXT:          }
 ; CHECK-NEXT:    }
@@ -84,11 +84,17 @@
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          // 1st level tiling - Points
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c2 = 0; c2 <= 31; c2 += 1)
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:            for (int c3 = 0; c3 <= 31; c3 += 1)
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:              Stmt_bb14(32 * c0 + c2, 32 * c1 + c3);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:              Stmt_bb9(32 * c0 + c2, 32 * c1 + c3);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:        }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:      // 1st level tiling - Tiles
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c1 = 0; c1 <= 3; c1 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c1 = 0; c1 <= 3; c1 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c3 = 0; c3 <= 1055; c3 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:            CopyStmt_0(0, c3, c4);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c2 = 0; c2 <= 10; c2 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:            for (int c5 = 256 * c1; c5 <= 256 * c1 + 255; c5 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:              CopyStmt_1(c3, 0, c5);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          // 1st level tiling - Points
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          // Register tiling - Tiles
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c3 = 0; c3 <= 131; c3 += 1)
@@ -96,43 +102,44 @@
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:              for (int c5 = 0; c5 <= 255; c5 += 1) {
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:                // Register tiling - Points
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:                // 1st level tiling - Tiles
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                // 1st level tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:               // 1st level tiling - Points
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:                {
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:                }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:              }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:        }
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:      }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:    }
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -142,60 +149,43 @@ define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3,
 bb:
   br label %bb8
 
-bb8:                                              ; preds = %bb39, %bb
-  %tmp = phi i32 [ 0, %bb ], [ %tmp40, %bb39 ]
-  %tmp9 = icmp slt i32 %tmp, 1056
-  br i1 %tmp9, label %bb10, label %bb41
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
 
-bb10:                                             ; preds = %bb8
-  br label %bb11
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
 
-bb11:                                             ; preds = %bb37, %bb10
-  %tmp12 = phi i32 [ 0, %bb10 ], [ %tmp38, %bb37 ]
-  %tmp13 = icmp slt i32 %tmp12, 1056
-  br i1 %tmp13, label %bb14, label %bb39
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1024
+  br i1 %tmp25, label %Copy_0, label %bb26
 
-bb14:                                             ; preds = %bb11
-  %tmp15 = sext i32 %tmp12 to i64
-  %tmp16 = sext i32 %tmp to i64
-  %tmp17 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp16
-  %tmp18 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp17, i64 0, i64 %tmp15
-  %tmp19 = load double, double* %tmp18, align 8
-  %tmp20 = fmul double %tmp19, %arg4
-  store double %tmp20, double* %tmp18, align 8
-  br label %bb21
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 1
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
 
-bb21:                                             ; preds = %bb24, %bb14
-  %tmp22 = phi i32 [ 0, %bb14 ], [ %tmp36, %bb24 ]
-  %tmp23 = icmp slt i32 %tmp22, 1024
-  br i1 %tmp23, label %bb24, label %bb37
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
 
-bb24:                                             ; preds = %bb21
-  %tmp25 = sext i32 %tmp22 to i64
-  %tmp26 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp16
-  %tmp27 = getelementptr inbounds [1024 x double], [1024 x double]* %tmp26, i64 0, i64 %tmp25
-  %tmp28 = load double, double* %tmp27, align 8
-  %tmp29 = fmul double %arg3, %tmp28
-  %tmp30 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp25
-  %tmp31 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp30, i64 0, i64 %tmp15
-  %tmp32 = load double, double* %tmp31, align 8
-  %tmp33 = fmul double %tmp29, %tmp32
-  %tmp34 = load double, double* %tmp18, align 8
-  %tmp35 = fadd double %tmp34, %tmp33
-  store double %tmp35, double* %tmp18, align 8
-  %tmp36 = add nsw i32 %tmp22, 1
-  br label %bb21
-
-bb37:                                             ; preds = %bb21
-  %tmp38 = add nsw i32 %tmp12, 1
-  br label %bb11
-
-bb39:                                             ; preds = %bb11
-  %tmp40 = add nsw i32 %tmp, 1
-  br label %bb8
-
-bb41:                                             ; preds = %bb8
+bb32:                                             ; preds = %bb29
   ret void
 }
-
-attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }