| author | Uday Bondhugula <bondhugula@google.com> | 2019-01-25 14:06:32 -0800 |
|---|---|---|
| committer | jpienaar <jpienaar@google.com> | 2019-03-29 15:37:53 -0700 |
| commit | b588d58c5f2c8c570600210050bd9beaed0c3a24 | |
| tree | 91c0072ea3bbe731d932c532c656cebc34c68599 | |
| parent | c3424c3c7526641a08c993ab881c1d4e237bfddb | |
Update createAffineComputationSlice to generate single result affine maps
- Update createAffineComputationSlice to generate a sequence of single-result affine apply ops instead of one multi-result affine apply.
- Update the pipeline-data-transfer test case; while at it, also update the test case to use only single-result affine maps and make it more robust to change.
PiperOrigin-RevId: 230965478
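
To illustrate the effect on the generated IR, here is a minimal before/after sketch distilled from the updated pipeline-data-transfer test expectations below; the value names (`%src`, `%buf`, `%tag`, `%idx`) are illustrative and not taken from the test:

```mlir
// Before: one multi-result affine_apply, whose results are addressed as %idx#0, %idx#1.
%idx = affine_apply (d0) -> (d0 mod 2, d0 mod 2)(%i0)
dma_start %src[%i0], %buf[%idx#0, %i0], %c128, %tag[%idx#1, %c0]
  : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>

// After: a sequence of single-result affine_apply ops, one per result expression.
%idx0 = affine_apply (d0) -> (d0 mod 2)(%i0)
%idx1 = affine_apply (d0) -> (d0 mod 2)(%i0)
dma_start %src[%i0], %buf[%idx0, %i0], %c128, %tag[%idx1, %c0]
  : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
```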
| | file | lines changed |
|---|---|---|
| -rw-r--r-- | mlir/include/mlir/Transforms/Utils.h | 19 |
| -rw-r--r-- | mlir/lib/Transforms/PipelineDataTransfer.cpp | 8 |
| -rw-r--r-- | mlir/lib/Transforms/Utils/Utils.cpp | 44 |
| -rw-r--r-- | mlir/test/Transforms/pipeline-data-transfer.mlir | 174 |

4 files changed, 136 insertions, 109 deletions
```diff
diff --git a/mlir/include/mlir/Transforms/Utils.h b/mlir/include/mlir/Transforms/Utils.h
index 2a1505aecb7..b3fe4471699 100644
--- a/mlir/include/mlir/Transforms/Utils.h
+++ b/mlir/include/mlir/Transforms/Utils.h
@@ -82,10 +82,10 @@ createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
                             ArrayRef<OperationInst *> affineApplyOps,
                             SmallVectorImpl<Value *> *results);
 
-/// Given an operation instruction, inserts a new single affine apply operation,
-/// that is exclusively used by this operation instruction, and that provides
-/// all operands that are results of an affine_apply as a function of loop
-/// iterators and program parameters and whose results are.
+/// Given an operation instruction, inserts one or more single result affine
+/// apply operations, results of which are exclusively used by this operation
+/// instruction. The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
 ///
 /// Before
 ///
@@ -105,10 +105,13 @@ createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
 /// This allows the application of different transformations on send and
 /// compute (for eg. different shifts/delays)
 ///
-/// Returns nullptr if none of the operands were the result of an affine_apply
-/// and thus there was no affine computation slice to create. Returns the newly
-/// affine_apply operation instruction otherwise.
-OperationInst *createAffineComputationSlice(OperationInst *opInst);
+/// Returns nullptr either if none of opInst's operands were the result of an
+/// affine_apply (i.e., there was no affine computation slice to create), or if
+/// all the affine_apply op's supplying operands to this opInst did not have any
+/// uses other than those in this opInst. The method otherwise returns the list
+/// of affine_apply operations created in output argument `sliceOps`.
+void createAffineComputationSlice(
+    OperationInst *opInst, SmallVectorImpl<OpPointer<AffineApplyOp>> *sliceOps);
 
 /// Folds the lower and upper bounds of a 'for' inst to constants if possible.
 /// Returns false if the folding happens for at least one bound, true otherwise.
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index 9e7c928070f..101a00eaf61 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -325,8 +325,12 @@ PassResult PipelineDataTransfer::runOnForInst(ForInst *forInst) {
   assert(dmaStartInst->isa<DmaStartOp>());
   instShiftMap[dmaStartInst] = 0;
   // Set shifts for DMA start inst's affine operand computation slices to 0.
-  if (auto *slice = mlir::createAffineComputationSlice(dmaStartInst)) {
-    instShiftMap[slice] = 0;
+  SmallVector<OpPointer<AffineApplyOp>, 4> sliceOps;
+  mlir::createAffineComputationSlice(dmaStartInst, &sliceOps);
+  if (!sliceOps.empty()) {
+    for (auto sliceOp : sliceOps) {
+      instShiftMap[sliceOp->getInstruction()] = 0;
+    }
   } else {
     // If a slice wasn't created, the reachable affine_apply op's from its
     // operands are the ones that go with it.
```
```diff
diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp
index 0c5581cb455..4101a07a33d 100644
--- a/mlir/lib/Transforms/Utils/Utils.cpp
+++ b/mlir/lib/Transforms/Utils/Utils.cpp
@@ -170,10 +170,10 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
   return true;
 }
 
-/// Given an operation instruction, inserts a new single affine apply operation,
-/// that is exclusively used by this operation instruction, and that provides
-/// all operands that are results of an affine_apply as a function of loop
-/// iterators and program parameters and whose results are.
+/// Given an operation instruction, inserts one or more single result affine
+/// apply operations, results of which are exclusively used by this operation
+/// instruction. The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
 ///
 /// Before
 ///
@@ -195,10 +195,12 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
 ///
 /// Returns nullptr either if none of opInst's operands were the result of an
 /// affine_apply and thus there was no affine computation slice to create, or if
-/// all the affine_apply op's supplying operands to this opInst do not have any
-/// uses besides this opInst. Returns the new affine_apply operation instruction
-/// otherwise.
-OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
+/// all the affine_apply op's supplying operands to this opInst did not have any
+/// uses besides this opInst; otherwise returns the list of affine_apply
+/// operations created in output argument `sliceOps`.
+void mlir::createAffineComputationSlice(
+    OperationInst *opInst,
+    SmallVectorImpl<OpPointer<AffineApplyOp>> *sliceOps) {
   // Collect all operands that are results of affine apply ops.
   SmallVector<Value *, 4> subOperands;
   subOperands.reserve(opInst->getNumOperands());
@@ -214,7 +216,7 @@ OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
   getReachableAffineApplyOps(subOperands, affineApplyOps);
   // Skip transforming if there are no affine maps to compose.
   if (affineApplyOps.empty())
-    return nullptr;
+    return;
 
   // Check if all uses of the affine apply op's lie only in this op inst, in
   // which case there would be nothing to do.
@@ -230,19 +232,26 @@ OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
     }
   }
   if (localized)
-    return nullptr;
+    return;
 
   FuncBuilder builder(opInst);
   SmallVector<Value *, 4> composedOpOperands(subOperands);
-  auto map = builder.getMultiDimIdentityMap(composedOpOperands.size());
-  fullyComposeAffineMapAndOperands(&map, &composedOpOperands);
-  auto affineApply =
-      builder.create<AffineApplyOp>(opInst->getLoc(), map, composedOpOperands);
+  auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
+  fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
+
+  // Create an affine_apply for each of the map results.
+  sliceOps->reserve(composedMap.getNumResults());
+  for (auto resultExpr : composedMap.getResults()) {
+    auto singleResMap = builder.getAffineMap(
+        composedMap.getNumDims(), composedMap.getNumSymbols(), resultExpr, {});
+    sliceOps->push_back(builder.create<AffineApplyOp>(
+        opInst->getLoc(), singleResMap, composedOpOperands));
+  }
 
   // Construct the new operands that include the results from the composed
   // affine apply op above instead of existing ones (subOperands). So, they
   // differ from opInst's operands only for those operands in 'subOperands', for
-  // which they will be replaced by the corresponding one from 'results'.
+  // which they will be replaced by the corresponding one from 'sliceOps'.
   SmallVector<Value *, 4> newOperands(opInst->getOperands());
   for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
     // Replace the subOperands from among the new operands.
@@ -252,15 +261,12 @@ OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
       break;
     }
     if (j < subOperands.size()) {
-      newOperands[i] = affineApply->getResult(j);
+      newOperands[i] = (*sliceOps)[j]->getResult(0);
    }
  }
-
  for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
    opInst->setOperand(idx, newOperands[idx]);
  }
-
-  return affineApply->getInstruction();
 }
 
 /// Folds the specified (lower or upper) bound to a constant if possible
diff --git a/mlir/test/Transforms/pipeline-data-transfer.mlir b/mlir/test/Transforms/pipeline-data-transfer.mlir
index ad420dfb6d6..e303cd532cf 100644
--- a/mlir/test/Transforms/pipeline-data-transfer.mlir
+++ b/mlir/test/Transforms/pipeline-data-transfer.mlir
@@ -1,41 +1,11 @@
 // RUN: mlir-opt %s -pipeline-data-transfer | FileCheck %s
 
-// CHECK-DAG: [[FLOOR_MOD_2_2D:#map[0-9]+]] = (d0) -> ((d0 floordiv 4) mod 2, (d0 floordiv 4) mod 2)
-// CHECK-DAG: [[FLOOR_MOD_2:#map[0-9]+]] = (d0) -> ((d0 floordiv 4) mod 2)
 // CHECK-DAG: [[MOD_2:#map[0-9]+]] = (d0) -> (d0 mod 2)
+// CHECK-DAG: [[FLOOR_MOD_2:#map[0-9]+]] = (d0) -> ((d0 floordiv 4) mod 2)
 // CHECK-DAG: [[REMAP_SHIFT_MINUS_4:#map[0-9]+]] = (d0) -> (d0 - 4)
 
 // CHECK-LABEL: func @loop_nest_dma() {
 func @loop_nest_dma() {
-// CHECK: %0 = alloc() : memref<256xf32>
-// CHECK: %1 = alloc() : memref<2x32xf32, 1>
-// CHECK: %2 = alloc() : memref<2x1xf32>
-// CHECK: dma_start %0[%c0], %1[%3#0, %c0], %c128, %2[%3#1, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-// CHECK-NEXT: for %i0 = 1 to 8 {
-// CHECK-NEXT:   %4 = affine_apply #map0(%i0)
-// CHECK-NEXT:   dma_start %0[%i0], %1[%4#0, %i0], %c128, %2[%4#1, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-// CHECK-NEXT:   %5 = affine_apply #map1(%i0)
-// CHECK-NEXT:   %6 = affine_apply #map2(%5)
-// CHECK-NEXT:   %7 = affine_apply #map2(%5)
-// CHECK-NEXT:   dma_wait %2[%6, %c0_0], %c128 : memref<2x1xf32>
-// CHECK-NEXT:   %8 = load %1[%7, %5] : memref<2x32xf32, 1>
-// CHECK-NEXT:   %9 = "compute"(%8) : (f32) -> f32
-// CHECK-NEXT:   store %9, %1[%7, %5] : memref<2x32xf32, 1>
-// CHECK-NEXT:   for %i1 = 0 to 128 {
-// CHECK-NEXT:     "do_more_compute"(%5, %i1) : (index, index) -> ()
-// CHECK-NEXT:   }
-// CHECK-NEXT: }
-// CHECK-NEXT: %10 = affine_apply #map1(%c8)
-// CHECK-NEXT: %11 = affine_apply #map2(%10)
-// CHECK-NEXT: %12 = affine_apply #map2(%10)
-// CHECK-NEXT: dma_wait %2[%11, %c0_0], %c128 : memref<2x1xf32>
-// CHECK-NEXT: %13 = load %1[%12, %10] : memref<2x32xf32, 1>
-// CHECK-NEXT: %14 = "compute"(%13) : (f32) -> f32
-// CHECK-NEXT: store %14, %1[%12, %10] : memref<2x32xf32, 1>
-// CHECK-NEXT: for %i2 = 0 to 128 {
-// CHECK-NEXT:   "do_more_compute"(%10, %i2) : (index, index) -> ()
-// CHECK-NEXT: }
-// CHECK-NEXT: return
 
   %A = alloc() : memref<256 x f32, (d0) -> (d0), 0>
   %Ah = alloc() : memref<32 x f32, (d0) -> (d0), 1>
@@ -57,6 +27,40 @@ func @loop_nest_dma() {
   }
   return
 }
+// CHECK: %0 = alloc() : memref<256xf32>
+// CHECK: %1 = alloc() : memref<2x32xf32, 1>
+// CHECK-NEXT: %2 = alloc() : memref<2x1xf32>
+// CHECK-NEXT: %3 = affine_apply [[MOD_2]](%c0)
+// CHECK-NEXT: %4 = affine_apply [[MOD_2]](%c0)
+// CHECK-NEXT: dma_start %0[%c0], %1[%3, %c0], %c128, %2[%4, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+// CHECK-NEXT: for %i0 = 1 to 8 {
+// CHECK-NEXT:   %5 = affine_apply [[MOD_2]](%i0)
+// CHECK-NEXT:   %6 = affine_apply [[MOD_2]](%i0)
+// CHECK-NEXT:   dma_start %0[%i0], %1[%5, %i0], %c128, %2[%6, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+// CHECK-NEXT:   %7 = affine_apply #map1(%i0)
+// CHECK-NEXT:   %8 = affine_apply [[MOD_2]](%7)
+// CHECK-NEXT:   %9 = affine_apply [[MOD_2]](%7)
+// CHECK-NEXT:   dma_wait %2[%8, %c0_0], %c128 : memref<2x1xf32>
+// CHECK-NEXT:   %10 = load %1[%9, %7] : memref<2x32xf32, 1>
+// CHECK-NEXT:   %11 = "compute"(%10) : (f32) -> f32
+// CHECK-NEXT:   store %11, %1[%9, %7] : memref<2x32xf32, 1>
+// CHECK-NEXT:   for %i1 = 0 to 128 {
+// CHECK-NEXT:     "do_more_compute"(%7, %i1) : (index, index) -> ()
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: %12 = affine_apply #map1(%c8)
+// CHECK-NEXT: %13 = affine_apply [[MOD_2]](%12)
+// CHECK-NEXT: %14 = affine_apply [[MOD_2]](%12)
+// CHECK-NEXT: dma_wait %2[%13, %c0_0], %c128 : memref<2x1xf32>
+// CHECK-NEXT: %15 = load %1[%14, %12] : memref<2x32xf32, 1>
+// CHECK-NEXT: %16 = "compute"(%15) : (f32) -> f32
+// CHECK-NEXT: store %16, %1[%14, %12] : memref<2x32xf32, 1>
+// CHECK-NEXT: for %i2 = 0 to 128 {
+// CHECK-NEXT:   "do_more_compute"(%12, %i2) : (index, index) -> ()
+// CHECK-NEXT: }
+// CHECK-NEXT: return
+// CHECK-NEXT:}
 
 // CHECK-LABEL: @loop_step
 func @loop_step(%arg0: memref<512xf32>,
@@ -74,27 +78,29 @@ func @loop_step(%arg0: memref<512xf32>,
   return
 }
 // CHECK: [[TAG:%[0-9]+]] = alloc() : memref<2x1xi32>
-// CHECK: %2 = affine_apply [[FLOOR_MOD_2_2D]](%c0)
-// CHECK-NEXT: dma_start %arg0[%c0], %0[%2#0, %c0_0], %c4, [[TAG]][%2#1, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
+// CHECK: %2 = affine_apply [[FLOOR_MOD_2]](%c0)
+// CHECK: %3 = affine_apply [[FLOOR_MOD_2]](%c0)
+// CHECK-NEXT: dma_start %arg0[%c0], %0[%2, %c0_0], %c4, [[TAG]][%3, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
 // CHECK-NEXT: for %i0 = 4 to 512 step 4 {
-// CHECK-NEXT:   %3 = affine_apply [[FLOOR_MOD_2_2D]](%i0)
-// CHECK-NEXT:   dma_start %arg0[%i0], %0[%3#0, %c0_0], %c4, [[TAG]][%3#1, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
-// CHECK-NEXT:   %4 = affine_apply [[REMAP_SHIFT_MINUS_4]](%i0)
-// CHECK-NEXT:   %5 = affine_apply [[FLOOR_MOD_2]](%4)
-// CHECK:        dma_wait [[TAG]][%5, %c0_0], %c4 : memref<2x1xi32>
-// CHECK-NEXT:   "compute"(%4) : (index) -> ()
+// CHECK-NEXT:   %4 = affine_apply [[FLOOR_MOD_2]](%i0)
+// CHECK-NEXT:   %5 = affine_apply [[FLOOR_MOD_2]](%i0)
+// CHECK-NEXT:   dma_start %arg0[%i0], %0[%4, %c0_0], %c4, [[TAG]][%5, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
+// CHECK-NEXT:   %6 = affine_apply [[REMAP_SHIFT_MINUS_4]](%i0)
+// CHECK-NEXT:   %7 = affine_apply [[FLOOR_MOD_2]](%6)
+// CHECK:        dma_wait [[TAG]][%7, %c0_0], %c4 : memref<2x1xi32>
+// CHECK-NEXT:   "compute"(%6) : (index) -> ()
 // CHECK-NEXT: }
 // CHECK-NEXT: [[SHIFTED:%[0-9]+]] = affine_apply [[REMAP_SHIFT_MINUS_4]](%c512)
-// CHECK-NEXT: %8 = affine_apply [[FLOOR_MOD_2]]([[SHIFTED]])
-// CHECK: dma_wait [[TAG]][%8, %c0_0], %c4 : memref<2x1xi32>
-// CHECK-NEXT: "compute"(%7) : (index) -> ()
+// CHECK-NEXT: %10 = affine_apply [[FLOOR_MOD_2]]([[SHIFTED]])
+// CHECK: dma_wait [[TAG]][%10, %c0_0], %c4 : memref<2x1xi32>
+// CHECK-NEXT: "compute"(%9) : (index) -> ()
 // CHECK-NEXT: return
 // CHECK-NEXT: }
 
 #map0 = (d0, d1) -> (d0, d1)
 #map1 = (d0, d1) -> ((d0 * 2048 + d1 * 256) floordiv 32, 0)
-#map2 = (d0) -> ((d0 * 2048) floordiv 32, 0)
-// CHECK: func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>
+#map2 = (d0) -> ((d0 * 2048) floordiv 32)
+// CHECK-LABEL: func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>
 func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>, #map0>, %arg1: memref<512x32xvector<8xf32>, #map0>,
                       %arg2: memref<512x32xvector<8xf32>, #map0>) {
   %num_elts = constant 256 : index
   %c0 = constant 0 : index
@@ -105,58 +111,63 @@ func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>, #map0>, %arg1: memref<
   %4 = alloc() : memref<2xi32>
   %5 = alloc() : memref<2xi32>
   // Prologue for DMA overlap on arg2.
+  // CHECK:[[TAG_ARG2:%[0-9]+]] = alloc() : memref<2x2xi32>
   // CHECK: dma_start %arg2[
-  // CHECK-NEXT: for %i0 = 1 to 8 {
+  // CHECK: for %i0 = 1 to 8 {
   for %i0 = 0 to 8 {
     %6 = affine_apply #map2(%i0)
-    dma_start %arg2[%6#0, %6#1], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
+    dma_start %arg2[%6, %c0], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
     dma_wait %5[%c0], %num_elts : memref<2xi32>
     // Steady state for DMA overlap on arg2
     // CHECK: dma_start %arg2[
-    // CHECK: dma_wait %1[
+    // CHECK: dma_wait [[TAG_ARG2]]
     // Prologue for DMA overlap on arg0, arg1 nested within i0
+    // CHECK: [[TAG_ARG0:%[0-9]+]] = alloc() : memref<2x2xi32>
+    // CHECK: [[TAG_ARG1:%[0-9]+]] = alloc() : memref<2x2xi32>
     // CHECK: dma_start %arg0[
     // CHECK: dma_start %arg1[
     // CHECK-NEXT for %i1 = 1 to 8 {
     for %i1 = 0 to 8 {
       %7 = affine_apply #map1(%i0, %i1)
       %8 = affine_apply #map2(%i1)
-      dma_start %arg0[%7#0, %7#1], %0[%c0, %c0], %num_elts, %3[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
-      dma_start %arg1[%8#0, %8#1], %1[%c0, %c0], %num_elts, %4[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
+      dma_start %arg0[%7, %c0], %0[%c0, %c0], %num_elts, %3[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
+      dma_start %arg1[%8, %c0], %1[%c0, %c0], %num_elts, %4[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
       dma_wait %3[%c0], %num_elts : memref<2xi32>
       dma_wait %4[%c0], %num_elts : memref<2xi32>
       // Steady state for DMA overlap on arg0, arg1
       // CHECK: dma_start %arg0[
       // CHECK: dma_start %arg1[
-      // CHECK: dma_wait %10[
-      // CHECK: dma_wait %11[
+      // CHECK: dma_wait [[TAG_ARG0]]
+      // CHECK: dma_wait [[TAG_ARG1]]
      // CHECK-NEXT: for %i2 = 0 to 4 {
      for %i2 = 0 to 4 {
        "foo"() : () -> ()
      }
    }
    // epilogue for arg0, arg1
-    // CHECK: dma_wait %10[
-    // CHECK: dma_wait %11[
-
-    // epilogue for DMA overlap on %arg2
-    // CHECK: dma_wait %1[%31, %c0_2], %c256 : memref<2x2xi32>
-    // Within the epilogue for arg2's DMA, we have the DMAs on %arg1, %arg2 nested.
-    // CHECK: dma_start %arg0[
-    // CHECK: dma_start %arg1[
-    // CHECK: for %i4 = 1 to 8 {
-    // CHECK: dma_start %arg0[
-    // CHECK: dma_start %arg1[
-    // CHECK: dma_wait %36[
-    // CHECK: dma_wait %37[
-    // CHECK: for %i5 = 0 to 4 {
-    // CHECK: "foo"() : () -> ()
-    // CHECK: dma_wait %36[
-    // CHECK: dma_wait %37[
-    // CHECK: for %i6 = 0 to 4 {
-
-  } // CHECK: }
-  return // CHECK-NEXT: return
+    // CHECK: dma_wait [[TAG_ARG0]]
+    // CHECK: dma_wait [[TAG_ARG1]]
+    // epilogue for DMA overlap on %arg2
+    // CHECK: dma_wait [[TAG_ARG2]]
+    // Within the epilogue for arg2's DMA, we have the DMAs on %arg1, %arg2 nested.
+    // CHECK: [[TAG_ARG0_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
+    // CHECK: [[TAG_ARG1_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
+    // CHECK: dma_start %arg0[
+    // CHECK: dma_start %arg1[
+    // CHECK: for %i4 = 1 to 8 {
+    // CHECK: dma_start %arg0[
+    // CHECK: dma_start %arg1[
+    // CHECK: dma_wait [[TAG_ARG0_NESTED]]
+    // CHECK: dma_wait [[TAG_ARG1_NESTED]]
+    // CHECK: for %i5 = 0 to 4 {
+    // CHECK: "foo"() : () -> ()
+    // CHECK: dma_wait [[TAG_ARG0_NESTED]]
+    // CHECK: dma_wait [[TAG_ARG1_NESTED]]
+    // CHECK: for %i6 = 0 to 4 {
+  }
+  return
+// CHECK: }
+// CHECK-NEXT: return
 }
 
 // CHECK: func @loop_dma_dependent
@@ -176,10 +187,10 @@ func @loop_dma_dependent(%arg2: memref<512x32xvector<8xf32>>) {
   // CHECK: for %i0 = 0 to 8 {
   for %i0 = 0 to 8 {
     %6 = affine_apply #map2(%i0)
-    dma_start %arg2[%6#0, %6#1], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
+    dma_start %arg2[%6, %c0], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
     dma_wait %5[%c0], %num_elts : memref<2xi32>
 
-    dma_start %2[%c0, %c0], %arg2[%6#0, %6#1], %num_elts, %5[%c0] : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
+    dma_start %2[%c0, %c0], %arg2[%6, %c0], %num_elts, %5[%c0] : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
     dma_wait %5[%c0], %num_elts : memref<2xi32>
   } // CHECK: }
   return // CHECK-NEXT: return
@@ -246,9 +257,9 @@ func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>) {
 // CHECK-NEXT: %1 = dim %0, 0 : memref<?x?xf32, 2>
 // CHECK-NEXT: %2 = dim %0, 1 : memref<?x?xf32, 2>
 // CHECK-NEXT: %3 = alloc(%1, %2) : memref<2x?x?xf32, 2>
-
-// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%5#0, %c0_0, %c0_0],
-// CHECK-NEXT: for %i0 = 1 to 16 {
+// CHECK: %5 = affine_apply [[MOD_2]](%c0)
+// CHECK: %6 = affine_apply [[MOD_2]](%c0)
+// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%5, %c0_0, %c0_0], %c512, %4[%6, %c0_0]
 
   for %kTT = 0 to 16 {
     dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %num_elt, %tag[%zero] : memref<512 x 32 x f32>,
@@ -256,9 +267,12 @@ func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>) {
     dma_wait %tag[%zero], %num_elt : memref<1 x i32>
   }
   return
-// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%6#0, %c0_0, %c0_0], %c512, %4[%6#1, %c0_0]
-// CHECK: dma_wait %4[%8, %c0_0], %c512 : memref<2x1xi32>
+// CHECK-NEXT: for %i0 = 1 to 16 {
+// CHECK:   %7 = affine_apply [[MOD_2]](%i0)
+// CHECK:   %8 = affine_apply [[MOD_2]](%i0)
+// CHECK:   dma_start %arg0[%c0_0, %c0_0], %3[%7, %c0_0, %c0_0], %c512, %4[%8, %c0_0]
+// CHECK:   dma_wait %4[%10, %c0_0], %c512 : memref<2x1xi32>
 // CHECK: }
-// CHECK: dma_wait %4[%11, %c0_0], %c512 : memref<2x1xi32>
+// CHECK: dma_wait %4[%13, %c0_0], %c512 : memref<2x1xi32>
 // CHECK-NEXT: return
 }
```

