author      Uday Bondhugula <bondhugula@google.com>    2019-01-25 14:06:32 -0800
committer   jpienaar <jpienaar@google.com>             2019-03-29 15:37:53 -0700
commit      b588d58c5f2c8c570600210050bd9beaed0c3a24 (patch)
tree        91c0072ea3bbe731d932c532c656cebc34c68599
parent      c3424c3c7526641a08c993ab881c1d4e237bfddb (diff)
download    bcm5719-llvm-b588d58c5f2c8c570600210050bd9beaed0c3a24.tar.gz
            bcm5719-llvm-b588d58c5f2c8c570600210050bd9beaed0c3a24.zip
Update createAffineComputationSlice to generate single result affine maps
- Update createAffineComputationSlice to generate a sequence of single result
  affine apply ops instead of one multi-result affine apply
- update pipeline-data-transfer test case; while on this, also update the test
  case to use only single result affine maps, and make it more robust to change.

PiperOrigin-RevId: 230965478
-rw-r--r--mlir/include/mlir/Transforms/Utils.h19
-rw-r--r--mlir/lib/Transforms/PipelineDataTransfer.cpp8
-rw-r--r--mlir/lib/Transforms/Utils/Utils.cpp44
-rw-r--r--mlir/test/Transforms/pipeline-data-transfer.mlir174
4 files changed, 136 insertions, 109 deletions
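
For readers skimming the patch, here is a minimal sketch of the caller-side change. It is not code from this commit, only an illustration of the new output-argument convention using identifiers that appear in the diffs below (`dmaStartInst`, `sliceOps`); surrounding declarations and error handling are omitted.

    // Sketch only: old form returned a single multi-result affine_apply,
    //   if (auto *slice = mlir::createAffineComputationSlice(dmaStartInst)) { ... }
    // The new form fills an output vector with one single-result
    // affine_apply per result of the composed map.
    SmallVector<OpPointer<AffineApplyOp>, 4> sliceOps;
    mlir::createAffineComputationSlice(dmaStartInst, &sliceOps);
    for (auto sliceOp : sliceOps) {
      // Each slice op computes exactly one value; callers now take result 0
      // of the relevant op instead of result j of one multi-result apply.
      Value *replacement = sliceOp->getResult(0);
      (void)replacement;
    }
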
diff --git a/mlir/include/mlir/Transforms/Utils.h b/mlir/include/mlir/Transforms/Utils.h
index 2a1505aecb7..b3fe4471699 100644
--- a/mlir/include/mlir/Transforms/Utils.h
+++ b/mlir/include/mlir/Transforms/Utils.h
@@ -82,10 +82,10 @@ createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
ArrayRef<OperationInst *> affineApplyOps,
SmallVectorImpl<Value *> *results);
-/// Given an operation instruction, inserts a new single affine apply operation,
-/// that is exclusively used by this operation instruction, and that provides
-/// all operands that are results of an affine_apply as a function of loop
-/// iterators and program parameters and whose results are.
+/// Given an operation instruction, inserts one or more single result affine
+/// apply operations, results of which are exclusively used by this operation
+/// instruction. The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
///
/// Before
///
@@ -105,10 +105,13 @@ createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
/// This allows the application of different transformations on send and
/// compute (for eg. / different shifts/delays)
///
-/// Returns nullptr if none of the operands were the result of an affine_apply
-/// and thus there was no affine computation slice to create. Returns the newly
-/// affine_apply operation instruction otherwise.
-OperationInst *createAffineComputationSlice(OperationInst *opInst);
+/// Returns nullptr either if none of opInst's operands were the result of an
+/// affine_apply (i.e., there was no affine computation slice to create), or if
+/// all the affine_apply op's supplying operands to this opInst did not have any
+/// uses other than those in this opInst. The method otherwise returns the list
+/// of affine_apply operations created in output argument `sliceOps`.
+void createAffineComputationSlice(
+ OperationInst *opInst, SmallVectorImpl<OpPointer<AffineApplyOp>> *sliceOps);
/// Folds the lower and upper bounds of a 'for' inst to constants if possible.
/// Returns false if the folding happens for at least one bound, true otherwise.
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index 9e7c928070f..101a00eaf61 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -325,8 +325,12 @@ PassResult PipelineDataTransfer::runOnForInst(ForInst *forInst) {
assert(dmaStartInst->isa<DmaStartOp>());
instShiftMap[dmaStartInst] = 0;
// Set shifts for DMA start inst's affine operand computation slices to 0.
- if (auto *slice = mlir::createAffineComputationSlice(dmaStartInst)) {
- instShiftMap[slice] = 0;
+ SmallVector<OpPointer<AffineApplyOp>, 4> sliceOps;
+ mlir::createAffineComputationSlice(dmaStartInst, &sliceOps);
+ if (!sliceOps.empty()) {
+ for (auto sliceOp : sliceOps) {
+ instShiftMap[sliceOp->getInstruction()] = 0;
+ }
} else {
// If a slice wasn't created, the reachable affine_apply op's from its
// operands are the ones that go with it.
diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp
index 0c5581cb455..4101a07a33d 100644
--- a/mlir/lib/Transforms/Utils/Utils.cpp
+++ b/mlir/lib/Transforms/Utils/Utils.cpp
@@ -170,10 +170,10 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
return true;
}
-/// Given an operation instruction, inserts a new single affine apply operation,
-/// that is exclusively used by this operation instruction, and that provides
-/// all operands that are results of an affine_apply as a function of loop
-/// iterators and program parameters and whose results are.
+/// Given an operation instruction, inserts one or more single result affine
+/// apply operations, results of which are exclusively used by this operation
+/// instruction. The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
///
/// Before
///
@@ -195,10 +195,12 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
///
/// Returns nullptr either if none of opInst's operands were the result of an
/// affine_apply and thus there was no affine computation slice to create, or if
-/// all the affine_apply op's supplying operands to this opInst do not have any
-/// uses besides this opInst. Returns the new affine_apply operation instruction
-/// otherwise.
-OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
+/// all the affine_apply op's supplying operands to this opInst did not have any
+/// uses besides this opInst; otherwise returns the list of affine_apply
+/// operations created in output argument `sliceOps`.
+void mlir::createAffineComputationSlice(
+ OperationInst *opInst,
+ SmallVectorImpl<OpPointer<AffineApplyOp>> *sliceOps) {
// Collect all operands that are results of affine apply ops.
SmallVector<Value *, 4> subOperands;
subOperands.reserve(opInst->getNumOperands());
@@ -214,7 +216,7 @@ OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
getReachableAffineApplyOps(subOperands, affineApplyOps);
// Skip transforming if there are no affine maps to compose.
if (affineApplyOps.empty())
- return nullptr;
+ return;
// Check if all uses of the affine apply op's lie only in this op inst, in
// which case there would be nothing to do.
@@ -230,19 +232,26 @@ OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
}
}
if (localized)
- return nullptr;
+ return;
FuncBuilder builder(opInst);
SmallVector<Value *, 4> composedOpOperands(subOperands);
- auto map = builder.getMultiDimIdentityMap(composedOpOperands.size());
- fullyComposeAffineMapAndOperands(&map, &composedOpOperands);
- auto affineApply =
- builder.create<AffineApplyOp>(opInst->getLoc(), map, composedOpOperands);
+ auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
+ fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
+
+ // Create an affine_apply for each of the map results.
+ sliceOps->reserve(composedMap.getNumResults());
+ for (auto resultExpr : composedMap.getResults()) {
+ auto singleResMap = builder.getAffineMap(
+ composedMap.getNumDims(), composedMap.getNumSymbols(), resultExpr, {});
+ sliceOps->push_back(builder.create<AffineApplyOp>(
+ opInst->getLoc(), singleResMap, composedOpOperands));
+ }
// Construct the new operands that include the results from the composed
// affine apply op above instead of existing ones (subOperands). So, they
// differ from opInst's operands only for those operands in 'subOperands', for
- // which they will be replaced by the corresponding one from 'results'.
+ // which they will be replaced by the corresponding one from 'sliceOps'.
SmallVector<Value *, 4> newOperands(opInst->getOperands());
for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
// Replace the subOperands from among the new operands.
@@ -252,15 +261,12 @@ OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
break;
}
if (j < subOperands.size()) {
- newOperands[i] = affineApply->getResult(j);
+ newOperands[i] = (*sliceOps)[j]->getResult(0);
}
}
-
for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
opInst->setOperand(idx, newOperands[idx]);
}
-
- return affineApply->getInstruction();
}
/// Folds the specified (lower or upper) bound to a constant if possible
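
Extracted from the hunk above for readability, the newly added splitting step (the same code as in the patch, with the enclosing function and the operand-replacement logic elided) is:

    auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
    fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);

    // One single-result affine_apply per result expression of the composed map.
    sliceOps->reserve(composedMap.getNumResults());
    for (auto resultExpr : composedMap.getResults()) {
      auto singleResMap = builder.getAffineMap(
          composedMap.getNumDims(), composedMap.getNumSymbols(), resultExpr, {});
      sliceOps->push_back(builder.create<AffineApplyOp>(
          opInst->getLoc(), singleResMap, composedOpOperands));
    }

Each created op keeps the composed map's full dim/symbol operand list but exposes exactly one result, which is why multi-result maps such as (d0) -> ((d0 floordiv 4) mod 2, (d0 floordiv 4) mod 2) disappear from the test below in favor of two applications of (d0) -> ((d0 floordiv 4) mod 2).
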
diff --git a/mlir/test/Transforms/pipeline-data-transfer.mlir b/mlir/test/Transforms/pipeline-data-transfer.mlir
index ad420dfb6d6..e303cd532cf 100644
--- a/mlir/test/Transforms/pipeline-data-transfer.mlir
+++ b/mlir/test/Transforms/pipeline-data-transfer.mlir
@@ -1,41 +1,11 @@
// RUN: mlir-opt %s -pipeline-data-transfer | FileCheck %s
-// CHECK-DAG: [[FLOOR_MOD_2_2D:#map[0-9]+]] = (d0) -> ((d0 floordiv 4) mod 2, (d0 floordiv 4) mod 2)
-// CHECK-DAG: [[FLOOR_MOD_2:#map[0-9]+]] = (d0) -> ((d0 floordiv 4) mod 2)
// CHECK-DAG: [[MOD_2:#map[0-9]+]] = (d0) -> (d0 mod 2)
+// CHECK-DAG: [[FLOOR_MOD_2:#map[0-9]+]] = (d0) -> ((d0 floordiv 4) mod 2)
// CHECK-DAG: [[REMAP_SHIFT_MINUS_4:#map[0-9]+]] = (d0) -> (d0 - 4)
// CHECK-LABEL: func @loop_nest_dma() {
func @loop_nest_dma() {
-// CHECK: %0 = alloc() : memref<256xf32>
-// CHECK: %1 = alloc() : memref<2x32xf32, 1>
-// CHECK: %2 = alloc() : memref<2x1xf32>
-// CHECK: dma_start %0[%c0], %1[%3#0, %c0], %c128, %2[%3#1, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-// CHECK-NEXT: for %i0 = 1 to 8 {
-// CHECK-NEXT: %4 = affine_apply #map0(%i0)
-// CHECK-NEXT: dma_start %0[%i0], %1[%4#0, %i0], %c128, %2[%4#1, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-// CHECK-NEXT: %5 = affine_apply #map1(%i0)
-// CHECK-NEXT: %6 = affine_apply #map2(%5)
-// CHECK-NEXT: %7 = affine_apply #map2(%5)
-// CHECK-NEXT: dma_wait %2[%6, %c0_0], %c128 : memref<2x1xf32>
-// CHECK-NEXT: %8 = load %1[%7, %5] : memref<2x32xf32, 1>
-// CHECK-NEXT: %9 = "compute"(%8) : (f32) -> f32
-// CHECK-NEXT: store %9, %1[%7, %5] : memref<2x32xf32, 1>
-// CHECK-NEXT: for %i1 = 0 to 128 {
-// CHECK-NEXT: "do_more_compute"(%5, %i1) : (index, index) -> ()
-// CHECK-NEXT: }
-// CHECK-NEXT: }
-// CHECK-NEXT: %10 = affine_apply #map1(%c8)
-// CHECK-NEXT: %11 = affine_apply #map2(%10)
-// CHECK-NEXT: %12 = affine_apply #map2(%10)
-// CHECK-NEXT: dma_wait %2[%11, %c0_0], %c128 : memref<2x1xf32>
-// CHECK-NEXT: %13 = load %1[%12, %10] : memref<2x32xf32, 1>
-// CHECK-NEXT: %14 = "compute"(%13) : (f32) -> f32
-// CHECK-NEXT: store %14, %1[%12, %10] : memref<2x32xf32, 1>
-// CHECK-NEXT: for %i2 = 0 to 128 {
-// CHECK-NEXT: "do_more_compute"(%10, %i2) : (index, index) -> ()
-// CHECK-NEXT: }
-// CHECK-NEXT: return
%A = alloc() : memref<256 x f32, (d0) -> (d0), 0>
%Ah = alloc() : memref<32 x f32, (d0) -> (d0), 1>
@@ -57,6 +27,40 @@ func @loop_nest_dma() {
}
return
}
+// CHECK: %0 = alloc() : memref<256xf32>
+// CHECK: %1 = alloc() : memref<2x32xf32, 1>
+// CHECK-NEXT: %2 = alloc() : memref<2x1xf32>
+// CHECK-NEXT: %3 = affine_apply [[MOD_2]](%c0)
+// CHECK-NEXT: %4 = affine_apply [[MOD_2]](%c0)
+// CHECK-NEXT: dma_start %0[%c0], %1[%3, %c0], %c128, %2[%4, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+// CHECK-NEXT: for %i0 = 1 to 8 {
+// CHECK-NEXT: %5 = affine_apply [[MOD_2]](%i0)
+// CHECK-NEXT: %6 = affine_apply [[MOD_2]](%i0)
+// CHECK-NEXT: dma_start %0[%i0], %1[%5, %i0], %c128, %2[%6, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+// CHECK-NEXT: %7 = affine_apply #map1(%i0)
+// CHECK-NEXT: %8 = affine_apply [[MOD_2]](%7)
+// CHECK-NEXT: %9 = affine_apply [[MOD_2]](%7)
+// CHECK-NEXT: dma_wait %2[%8, %c0_0], %c128 : memref<2x1xf32>
+// CHECK-NEXT: %10 = load %1[%9, %7] : memref<2x32xf32, 1>
+// CHECK-NEXT: %11 = "compute"(%10) : (f32) -> f32
+// CHECK-NEXT: store %11, %1[%9, %7] : memref<2x32xf32, 1>
+// CHECK-NEXT: for %i1 = 0 to 128 {
+// CHECK-NEXT: "do_more_compute"(%7, %i1) : (index, index) -> ()
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: %12 = affine_apply #map1(%c8)
+// CHECK-NEXT: %13 = affine_apply [[MOD_2]](%12)
+// CHECK-NEXT: %14 = affine_apply [[MOD_2]](%12)
+// CHECK-NEXT: dma_wait %2[%13, %c0_0], %c128 : memref<2x1xf32>
+// CHECK-NEXT: %15 = load %1[%14, %12] : memref<2x32xf32, 1>
+// CHECK-NEXT: %16 = "compute"(%15) : (f32) -> f32
+// CHECK-NEXT: store %16, %1[%14, %12] : memref<2x32xf32, 1>
+// CHECK-NEXT: for %i2 = 0 to 128 {
+// CHECK-NEXT: "do_more_compute"(%12, %i2) : (index, index) -> ()
+// CHECK-NEXT: }
+// CHECK-NEXT: return
+// CHECK-NEXT:}
+
// CHECK-LABEL: @loop_step
func @loop_step(%arg0: memref<512xf32>,
@@ -74,27 +78,29 @@ func @loop_step(%arg0: memref<512xf32>,
return
}
// CHECK: [[TAG:%[0-9]+]] = alloc() : memref<2x1xi32>
-// CHECK: %2 = affine_apply [[FLOOR_MOD_2_2D]](%c0)
-// CHECK-NEXT: dma_start %arg0[%c0], %0[%2#0, %c0_0], %c4, [[TAG]][%2#1, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
+// CHECK: %2 = affine_apply [[FLOOR_MOD_2]](%c0)
+// CHECK: %3 = affine_apply [[FLOOR_MOD_2]](%c0)
+// CHECK-NEXT: dma_start %arg0[%c0], %0[%2, %c0_0], %c4, [[TAG]][%3, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
// CHECK-NEXT: for %i0 = 4 to 512 step 4 {
-// CHECK-NEXT: %3 = affine_apply [[FLOOR_MOD_2_2D]](%i0)
-// CHECK-NEXT: dma_start %arg0[%i0], %0[%3#0, %c0_0], %c4, [[TAG]][%3#1, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
-// CHECK-NEXT: %4 = affine_apply [[REMAP_SHIFT_MINUS_4]](%i0)
-// CHECK-NEXT: %5 = affine_apply [[FLOOR_MOD_2]](%4)
-// CHECK: dma_wait [[TAG]][%5, %c0_0], %c4 : memref<2x1xi32>
-// CHECK-NEXT: "compute"(%4) : (index) -> ()
+// CHECK-NEXT: %4 = affine_apply [[FLOOR_MOD_2]](%i0)
+// CHECK-NEXT: %5 = affine_apply [[FLOOR_MOD_2]](%i0)
+// CHECK-NEXT: dma_start %arg0[%i0], %0[%4, %c0_0], %c4, [[TAG]][%5, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
+// CHECK-NEXT: %6 = affine_apply [[REMAP_SHIFT_MINUS_4]](%i0)
+// CHECK-NEXT: %7 = affine_apply [[FLOOR_MOD_2]](%6)
+// CHECK: dma_wait [[TAG]][%7, %c0_0], %c4 : memref<2x1xi32>
+// CHECK-NEXT: "compute"(%6) : (index) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: [[SHIFTED:%[0-9]+]] = affine_apply [[REMAP_SHIFT_MINUS_4]](%c512)
-// CHECK-NEXT: %8 = affine_apply [[FLOOR_MOD_2]]([[SHIFTED]])
-// CHECK: dma_wait [[TAG]][%8, %c0_0], %c4 : memref<2x1xi32>
-// CHECK-NEXT: "compute"(%7) : (index) -> ()
+// CHECK-NEXT: %10 = affine_apply [[FLOOR_MOD_2]]([[SHIFTED]])
+// CHECK: dma_wait [[TAG]][%10, %c0_0], %c4 : memref<2x1xi32>
+// CHECK-NEXT: "compute"(%9) : (index) -> ()
// CHECK-NEXT: return
// CHECK-NEXT: }
#map0 = (d0, d1) -> (d0, d1)
#map1 = (d0, d1) -> ((d0 * 2048 + d1 * 256) floordiv 32, 0)
-#map2 = (d0) -> ((d0 * 2048) floordiv 32, 0)
-// CHECK: func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>
+#map2 = (d0) -> ((d0 * 2048) floordiv 32)
+// CHECK-LABEL: func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>
func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>, #map0>, %arg1: memref<512x32xvector<8xf32>, #map0>, %arg2: memref<512x32xvector<8xf32>, #map0>) {
%num_elts = constant 256 : index
%c0 = constant 0 : index
@@ -105,58 +111,63 @@ func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>, #map0>, %arg1: memref<
%4 = alloc() : memref<2xi32>
%5 = alloc() : memref<2xi32>
// Prologue for DMA overlap on arg2.
+ // CHECK:[[TAG_ARG2:%[0-9]+]] = alloc() : memref<2x2xi32>
// CHECK: dma_start %arg2[
- // CHECK-NEXT: for %i0 = 1 to 8 {
+ // CHECK: for %i0 = 1 to 8 {
for %i0 = 0 to 8 {
%6 = affine_apply #map2(%i0)
- dma_start %arg2[%6#0, %6#1], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
+ dma_start %arg2[%6, %c0], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
dma_wait %5[%c0], %num_elts : memref<2xi32>
// Steady state for DMA overlap on arg2
// CHECK: dma_start %arg2[
- // CHECK: dma_wait %1[
+ // CHECK: dma_wait [[TAG_ARG2]]
// Prologue for DMA overlap on arg0, arg1 nested within i0
+ // CHECK: [[TAG_ARG0:%[0-9]+]] = alloc() : memref<2x2xi32>
+ // CHECK: [[TAG_ARG1:%[0-9]+]] = alloc() : memref<2x2xi32>
// CHECK: dma_start %arg0[
// CHECK: dma_start %arg1[
// CHECK-NEXT for %i1 = 1 to 8 {
for %i1 = 0 to 8 {
%7 = affine_apply #map1(%i0, %i1)
%8 = affine_apply #map2(%i1)
- dma_start %arg0[%7#0, %7#1], %0[%c0, %c0], %num_elts, %3[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
- dma_start %arg1[%8#0, %8#1], %1[%c0, %c0], %num_elts, %4[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
+ dma_start %arg0[%7, %c0], %0[%c0, %c0], %num_elts, %3[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
+ dma_start %arg1[%8, %c0], %1[%c0, %c0], %num_elts, %4[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
dma_wait %3[%c0], %num_elts : memref<2xi32>
dma_wait %4[%c0], %num_elts : memref<2xi32>
// Steady state for DMA overlap on arg0, arg1
// CHECK: dma_start %arg0[
// CHECK: dma_start %arg1[
- // CHECK: dma_wait %10[
- // CHECK: dma_wait %11[
+ // CHECK: dma_wait [[TAG_ARG0]]
+ // CHECK: dma_wait [[TAG_ARG1]]
// CHECK-NEXT: for %i2 = 0 to 4 {
for %i2 = 0 to 4 {
"foo"() : () -> ()
}
}
// epilogue for arg0, arg1
- // CHECK: dma_wait %10[
- // CHECK: dma_wait %11[
-
- // epilogue for DMA overlap on %arg2
- // CHECK: dma_wait %1[%31, %c0_2], %c256 : memref<2x2xi32>
- // Within the epilogue for arg2's DMA, we have the DMAs on %arg1, %arg2 nested.
- // CHECK: dma_start %arg0[
- // CHECK: dma_start %arg1[
- // CHECK: for %i4 = 1 to 8 {
- // CHECK: dma_start %arg0[
- // CHECK: dma_start %arg1[
- // CHECK: dma_wait %36[
- // CHECK: dma_wait %37[
- // CHECK: for %i5 = 0 to 4 {
- // CHECK: "foo"() : () -> ()
- // CHECK: dma_wait %36[
- // CHECK: dma_wait %37[
- // CHECK: for %i6 = 0 to 4 {
-
- } // CHECK: }
- return // CHECK-NEXT: return
+ // CHECK: dma_wait [[TAG_ARG0]]
+ // CHECK: dma_wait [[TAG_ARG1]]
+ // epilogue for DMA overlap on %arg2
+ // CHECK: dma_wait [[TAG_ARG2]]
+ // Within the epilogue for arg2's DMA, we have the DMAs on %arg1, %arg2 nested.
+ // CHECK: [[TAG_ARG0_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
+ // CHECK: [[TAG_ARG1_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
+ // CHECK: dma_start %arg0[
+ // CHECK: dma_start %arg1[
+ // CHECK: for %i4 = 1 to 8 {
+ // CHECK: dma_start %arg0[
+ // CHECK: dma_start %arg1[
+ // CHECK: dma_wait [[TAG_ARG0_NESTED]]
+ // CHECK: dma_wait [[TAG_ARG1_NESTED]]
+ // CHECK: for %i5 = 0 to 4 {
+ // CHECK: "foo"() : () -> ()
+ // CHECK: dma_wait [[TAG_ARG0_NESTED]]
+ // CHECK: dma_wait [[TAG_ARG1_NESTED]]
+ // CHECK: for %i6 = 0 to 4 {
+ }
+ return
+// CHECK: }
+// CHECK-NEXT: return
}
// CHECK: func @loop_dma_dependent
@@ -176,10 +187,10 @@ func @loop_dma_dependent(%arg2: memref<512x32xvector<8xf32>>) {
// CHECK: for %i0 = 0 to 8 {
for %i0 = 0 to 8 {
%6 = affine_apply #map2(%i0)
- dma_start %arg2[%6#0, %6#1], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
+ dma_start %arg2[%6, %c0], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
dma_wait %5[%c0], %num_elts : memref<2xi32>
- dma_start %2[%c0, %c0], %arg2[%6#0, %6#1], %num_elts, %5[%c0] : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
+ dma_start %2[%c0, %c0], %arg2[%6, %c0], %num_elts, %5[%c0] : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
dma_wait %5[%c0], %num_elts : memref<2xi32>
} // CHECK: }
return // CHECK-NEXT: return
@@ -246,9 +257,9 @@ func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>) {
// CHECK-NEXT: %1 = dim %0, 0 : memref<?x?xf32, 2>
// CHECK-NEXT: %2 = dim %0, 1 : memref<?x?xf32, 2>
// CHECK-NEXT: %3 = alloc(%1, %2) : memref<2x?x?xf32, 2>
-
-// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%5#0, %c0_0, %c0_0],
-// CHECK-NEXT: for %i0 = 1 to 16 {
+// CHECK: %5 = affine_apply [[MOD_2]](%c0)
+// CHECK: %6 = affine_apply [[MOD_2]](%c0)
+// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%5, %c0_0, %c0_0], %c512, %4[%6, %c0_0]
for %kTT = 0 to 16 {
dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %num_elt, %tag[%zero] :
memref<512 x 32 x f32>,
@@ -256,9 +267,12 @@ func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>) {
dma_wait %tag[%zero], %num_elt : memref<1 x i32>
}
return
-// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%6#0, %c0_0, %c0_0], %c512, %4[%6#1, %c0_0]
-// CHECK: dma_wait %4[%8, %c0_0], %c512 : memref<2x1xi32>
+// CHECK-NEXT: for %i0 = 1 to 16 {
+// CHECK: %7 = affine_apply [[MOD_2]](%i0)
+// CHECK: %8 = affine_apply [[MOD_2]](%i0)
+// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%7, %c0_0, %c0_0], %c512, %4[%8, %c0_0]
+// CHECK: dma_wait %4[%10, %c0_0], %c512 : memref<2x1xi32>
// CHECK: }
-// CHECK: dma_wait %4[%11, %c0_0], %c512 : memref<2x1xi32>
+// CHECK: dma_wait %4[%13, %c0_0], %c512 : memref<2x1xi32>
// CHECK-NEXT: return
}