| author | Uday Bondhugula <bondhugula@google.com> | 2019-01-25 14:06:32 -0800 |
|---|---|---|
| committer | jpienaar <jpienaar@google.com> | 2019-03-29 15:37:53 -0700 |
| commit | b588d58c5f2c8c570600210050bd9beaed0c3a24 | |
| tree | 91c0072ea3bbe731d932c532c656cebc34c68599 | |
| parent | c3424c3c7526641a08c993ab881c1d4e237bfddb | |
Update createAffineComputationSlice to generate single result affine maps
- Update createAffineComputationSlice to generate a sequence of single-result affine apply ops instead of one multi-result affine apply.
- Update the pipeline-data-transfer test case; while at it, also update the test case to use only single-result affine maps and make it more robust to change.
PiperOrigin-RevId: 230965478
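
To illustrate the effect on the generated IR, here is a minimal before/after sketch distilled from the updated pipeline-data-transfer test expectations below; the value names (`%src`, `%buf`, `%tag`, `%idx`) are illustrative and not taken from the test:

```mlir
// Before: one multi-result affine_apply, whose results are addressed as %idx#0, %idx#1.
%idx = affine_apply (d0) -> (d0 mod 2, d0 mod 2)(%i0)
dma_start %src[%i0], %buf[%idx#0, %i0], %c128, %tag[%idx#1, %c0]
  : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>

// After: a sequence of single-result affine_apply ops, one per result expression.
%idx0 = affine_apply (d0) -> (d0 mod 2)(%i0)
%idx1 = affine_apply (d0) -> (d0 mod 2)(%i0)
dma_start %src[%i0], %buf[%idx0, %i0], %c128, %tag[%idx1, %c0]
  : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
```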
| | file | lines changed |
|---|---|---|
| -rw-r--r-- | mlir/include/mlir/Transforms/Utils.h | 19 |
| -rw-r--r-- | mlir/lib/Transforms/PipelineDataTransfer.cpp | 8 |
| -rw-r--r-- | mlir/lib/Transforms/Utils/Utils.cpp | 44 |
| -rw-r--r-- | mlir/test/Transforms/pipeline-data-transfer.mlir | 174 |

4 files changed, 136 insertions, 109 deletions
```diff
diff --git a/mlir/include/mlir/Transforms/Utils.h b/mlir/include/mlir/Transforms/Utils.h
index 2a1505aecb7..b3fe4471699 100644
--- a/mlir/include/mlir/Transforms/Utils.h
+++ b/mlir/include/mlir/Transforms/Utils.h
@@ -82,10 +82,10 @@ createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
                             ArrayRef<OperationInst *> affineApplyOps,
                             SmallVectorImpl<Value *> *results);
 
-/// Given an operation instruction, inserts a new single affine apply operation,
-/// that is exclusively used by this operation instruction, and that provides
-/// all operands that are results of an affine_apply as a function of loop
-/// iterators and program parameters and whose results are.
+/// Given an operation instruction, inserts one or more single result affine
+/// apply operations, results of which are exclusively used by this operation
+/// instruction. The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
 ///
 /// Before
 ///
@@ -105,10 +105,13 @@ createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
 /// This allows the application of different transformations on send and
 /// compute (for eg. different shifts/delays)
 ///
-/// Returns nullptr if none of the operands were the result of an affine_apply
-/// and thus there was no affine computation slice to create. Returns the newly
-/// affine_apply operation instruction otherwise.
-OperationInst *createAffineComputationSlice(OperationInst *opInst);
+/// Returns nullptr either if none of opInst's operands were the result of an
+/// affine_apply (i.e., there was no affine computation slice to create), or if
+/// all the affine_apply op's supplying operands to this opInst did not have any
+/// uses other than those in this opInst. The method otherwise returns the list
+/// of affine_apply operations created in output argument `sliceOps`.
+void createAffineComputationSlice(
+    OperationInst *opInst, SmallVectorImpl<OpPointer<AffineApplyOp>> *sliceOps);
 
 /// Folds the lower and upper bounds of a 'for' inst to constants if possible.
 /// Returns false if the folding happens for at least one bound, true otherwise.
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index 9e7c928070f..101a00eaf61 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -325,8 +325,12 @@ PassResult PipelineDataTransfer::runOnForInst(ForInst *forInst) {
   assert(dmaStartInst->isa<DmaStartOp>());
   instShiftMap[dmaStartInst] = 0;
   // Set shifts for DMA start inst's affine operand computation slices to 0.
-  if (auto *slice = mlir::createAffineComputationSlice(dmaStartInst)) {
-    instShiftMap[slice] = 0;
+  SmallVector<OpPointer<AffineApplyOp>, 4> sliceOps;
+  mlir::createAffineComputationSlice(dmaStartInst, &sliceOps);
+  if (!sliceOps.empty()) {
+    for (auto sliceOp : sliceOps) {
+      instShiftMap[sliceOp->getInstruction()] = 0;
+    }
   } else {
     // If a slice wasn't created, the reachable affine_apply op's from its
     // operands are the ones that go with it.
```
```diff
diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp
index 0c5581cb455..4101a07a33d 100644
--- a/mlir/lib/Transforms/Utils/Utils.cpp
+++ b/mlir/lib/Transforms/Utils/Utils.cpp
@@ -170,10 +170,10 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
   return true;
 }
 
-/// Given an operation instruction, inserts a new single affine apply operation,
-/// that is exclusively used by this operation instruction, and that provides
-/// all operands that are results of an affine_apply as a function of loop
-/// iterators and program parameters and whose results are.
+/// Given an operation instruction, inserts one or more single result affine
+/// apply operations, results of which are exclusively used by this operation
+/// instruction. The operands of these newly created affine apply ops are
+/// guaranteed to be loop iterators or terminal symbols of a function.
 ///
 /// Before
 ///
@@ -195,10 +195,12 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
 ///
 /// Returns nullptr either if none of opInst's operands were the result of an
 /// affine_apply and thus there was no affine computation slice to create, or if
-/// all the affine_apply op's supplying operands to this opInst do not have any
-/// uses besides this opInst. Returns the new affine_apply operation instruction
-/// otherwise.
-OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
+/// all the affine_apply op's supplying operands to this opInst did not have any
+/// uses besides this opInst; otherwise returns the list of affine_apply
+/// operations created in output argument `sliceOps`.
+void mlir::createAffineComputationSlice(
+    OperationInst *opInst,
+    SmallVectorImpl<OpPointer<AffineApplyOp>> *sliceOps) {
   // Collect all operands that are results of affine apply ops.
   SmallVector<Value *, 4> subOperands;
   subOperands.reserve(opInst->getNumOperands());
@@ -214,7 +216,7 @@ OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
   getReachableAffineApplyOps(subOperands, affineApplyOps);
   // Skip transforming if there are no affine maps to compose.
   if (affineApplyOps.empty())
-    return nullptr;
+    return;
 
   // Check if all uses of the affine apply op's lie only in this op inst, in
   // which case there would be nothing to do.
@@ -230,19 +232,26 @@ OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
     }
   }
   if (localized)
-    return nullptr;
+    return;
 
   FuncBuilder builder(opInst);
   SmallVector<Value *, 4> composedOpOperands(subOperands);
-  auto map = builder.getMultiDimIdentityMap(composedOpOperands.size());
-  fullyComposeAffineMapAndOperands(&map, &composedOpOperands);
-  auto affineApply =
-      builder.create<AffineApplyOp>(opInst->getLoc(), map, composedOpOperands);
+  auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size());
+  fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands);
+
+  // Create an affine_apply for each of the map results.
+  sliceOps->reserve(composedMap.getNumResults());
+  for (auto resultExpr : composedMap.getResults()) {
+    auto singleResMap = builder.getAffineMap(
+        composedMap.getNumDims(), composedMap.getNumSymbols(), resultExpr, {});
+    sliceOps->push_back(builder.create<AffineApplyOp>(
+        opInst->getLoc(), singleResMap, composedOpOperands));
+  }
 
   // Construct the new operands that include the results from the composed
   // affine apply op above instead of existing ones (subOperands). So, they
   // differ from opInst's operands only for those operands in 'subOperands', for
-  // which they will be replaced by the corresponding one from 'results'.
+  // which they will be replaced by the corresponding one from 'sliceOps'.
   SmallVector<Value *, 4> newOperands(opInst->getOperands());
   for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
     // Replace the subOperands from among the new operands.
@@ -252,15 +261,12 @@ OperationInst *mlir::createAffineComputationSlice(OperationInst *opInst) {
       break;
     }
     if (j < subOperands.size()) {
-      newOperands[i] = affineApply->getResult(j);
+      newOperands[i] = (*sliceOps)[j]->getResult(0);
    }
  }
-
  for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) {
    opInst->setOperand(idx, newOperands[idx]);
  }
-
-  return affineApply->getInstruction();
 }
 
 /// Folds the specified (lower or upper) bound to a constant if possible
diff --git a/mlir/test/Transforms/pipeline-data-transfer.mlir b/mlir/test/Transforms/pipeline-data-transfer.mlir
index ad420dfb6d6..e303cd532cf 100644
--- a/mlir/test/Transforms/pipeline-data-transfer.mlir
+++ b/mlir/test/Transforms/pipeline-data-transfer.mlir
@@ -1,41 +1,11 @@
 // RUN: mlir-opt %s -pipeline-data-transfer | FileCheck %s
 
-// CHECK-DAG: [[FLOOR_MOD_2_2D:#map[0-9]+]] = (d0) -> ((d0 floordiv 4) mod 2, (d0 floordiv 4) mod 2)
-// CHECK-DAG: [[FLOOR_MOD_2:#map[0-9]+]] = (d0) -> ((d0 floordiv 4) mod 2)
 // CHECK-DAG: [[MOD_2:#map[0-9]+]] = (d0) -> (d0 mod 2)
+// CHECK-DAG: [[FLOOR_MOD_2:#map[0-9]+]] = (d0) -> ((d0 floordiv 4) mod 2)
 // CHECK-DAG: [[REMAP_SHIFT_MINUS_4:#map[0-9]+]] = (d0) -> (d0 - 4)
 
 // CHECK-LABEL: func @loop_nest_dma() {
 func @loop_nest_dma() {
-// CHECK: %0 = alloc() : memref<256xf32>
-// CHECK: %1 = alloc() : memref<2x32xf32, 1>
-// CHECK: %2 = alloc() : memref<2x1xf32>
-// CHECK: dma_start %0[%c0], %1[%3#0, %c0], %c128, %2[%3#1, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-// CHECK-NEXT: for %i0 = 1 to 8 {
-// CHECK-NEXT:   %4 = affine_apply #map0(%i0)
-// CHECK-NEXT:   dma_start %0[%i0], %1[%4#0, %i0], %c128, %2[%4#1, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
-// CHECK-NEXT:   %5 = affine_apply #map1(%i0)
-// CHECK-NEXT:   %6 = affine_apply #map2(%5)
-// CHECK-NEXT:   %7 = affine_apply #map2(%5)
-// CHECK-NEXT:   dma_wait %2[%6, %c0_0], %c128 : memref<2x1xf32>
-// CHECK-NEXT:   %8 = load %1[%7, %5] : memref<2x32xf32, 1>
-// CHECK-NEXT:   %9 = "compute"(%8) : (f32) -> f32
-// CHECK-NEXT:   store %9, %1[%7, %5] : memref<2x32xf32, 1>
-// CHECK-NEXT:   for %i1 = 0 to 128 {
-// CHECK-NEXT:     "do_more_compute"(%5, %i1) : (index, index) -> ()
-// CHECK-NEXT:   }
-// CHECK-NEXT: }
-// CHECK-NEXT: %10 = affine_apply #map1(%c8)
-// CHECK-NEXT: %11 = affine_apply #map2(%10)
-// CHECK-NEXT: %12 = affine_apply #map2(%10)
-// CHECK-NEXT: dma_wait %2[%11, %c0_0], %c128 : memref<2x1xf32>
-// CHECK-NEXT: %13 = load %1[%12, %10] : memref<2x32xf32, 1>
-// CHECK-NEXT: %14 = "compute"(%13) : (f32) -> f32
-// CHECK-NEXT: store %14, %1[%12, %10] : memref<2x32xf32, 1>
-// CHECK-NEXT: for %i2 = 0 to 128 {
-// CHECK-NEXT:   "do_more_compute"(%10, %i2) : (index, index) -> ()
-// CHECK-NEXT: }
-// CHECK-NEXT: return
 
   %A = alloc() : memref<256 x f32, (d0) -> (d0), 0>
   %Ah = alloc() : memref<32 x f32, (d0) -> (d0), 1>
@@ -57,6 +27,40 @@ func @loop_nest_dma() {
   }
   return
 }
+// CHECK: %0 = alloc() : memref<256xf32>
+// CHECK: %1 = alloc() : memref<2x32xf32, 1>
+// CHECK-NEXT: %2 = alloc() : memref<2x1xf32>
+// CHECK-NEXT: %3 = affine_apply [[MOD_2]](%c0)
+// CHECK-NEXT: %4 = affine_apply [[MOD_2]](%c0)
+// CHECK-NEXT: dma_start %0[%c0], %1[%3, %c0], %c128, %2[%4, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+// CHECK-NEXT: for %i0 = 1 to 8 {
+// CHECK-NEXT:   %5 = affine_apply [[MOD_2]](%i0)
+// CHECK-NEXT:   %6 = affine_apply [[MOD_2]](%i0)
+// CHECK-NEXT:   dma_start %0[%i0], %1[%5, %i0], %c128, %2[%6, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
+// CHECK-NEXT:   %7 = affine_apply #map1(%i0)
+// CHECK-NEXT:   %8 = affine_apply [[MOD_2]](%7)
+// CHECK-NEXT:   %9 = affine_apply [[MOD_2]](%7)
+// CHECK-NEXT:   dma_wait %2[%8, %c0_0], %c128 : memref<2x1xf32>
+// CHECK-NEXT:   %10 = load %1[%9, %7] : memref<2x32xf32, 1>
+// CHECK-NEXT:   %11 = "compute"(%10) : (f32) -> f32
+// CHECK-NEXT:   store %11, %1[%9, %7] : memref<2x32xf32, 1>
+// CHECK-NEXT:   for %i1 = 0 to 128 {
+// CHECK-NEXT:     "do_more_compute"(%7, %i1) : (index, index) -> ()
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: %12 = affine_apply #map1(%c8)
+// CHECK-NEXT: %13 = affine_apply [[MOD_2]](%12)
+// CHECK-NEXT: %14 = affine_apply [[MOD_2]](%12)
+// CHECK-NEXT: dma_wait %2[%13, %c0_0], %c128 : memref<2x1xf32>
+// CHECK-NEXT: %15 = load %1[%14, %12] : memref<2x32xf32, 1>
+// CHECK-NEXT: %16 = "compute"(%15) : (f32) -> f32
+// CHECK-NEXT: store %16, %1[%14, %12] : memref<2x32xf32, 1>
+// CHECK-NEXT: for %i2 = 0 to 128 {
+// CHECK-NEXT:   "do_more_compute"(%12, %i2) : (index, index) -> ()
+// CHECK-NEXT: }
+// CHECK-NEXT: return
+// CHECK-NEXT:}
 
 // CHECK-LABEL: @loop_step
 func @loop_step(%arg0: memref<512xf32>,
@@ -74,27 +78,29 @@ func @loop_step(%arg0: memref<512xf32>,
   return
 }
 // CHECK: [[TAG:%[0-9]+]] = alloc() : memref<2x1xi32>
-// CHECK: %2 = affine_apply [[FLOOR_MOD_2_2D]](%c0)
-// CHECK-NEXT: dma_start %arg0[%c0], %0[%2#0, %c0_0], %c4, [[TAG]][%2#1, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
+// CHECK: %2 = affine_apply [[FLOOR_MOD_2]](%c0)
+// CHECK: %3 = affine_apply [[FLOOR_MOD_2]](%c0)
+// CHECK-NEXT: dma_start %arg0[%c0], %0[%2, %c0_0], %c4, [[TAG]][%3, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
 // CHECK-NEXT: for %i0 = 4 to 512 step 4 {
-// CHECK-NEXT:   %3 = affine_apply [[FLOOR_MOD_2_2D]](%i0)
-// CHECK-NEXT:   dma_start %arg0[%i0], %0[%3#0, %c0_0], %c4, [[TAG]][%3#1, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
-// CHECK-NEXT:   %4 = affine_apply [[REMAP_SHIFT_MINUS_4]](%i0)
-// CHECK-NEXT:   %5 = affine_apply [[FLOOR_MOD_2]](%4)
-// CHECK:        dma_wait [[TAG]][%5, %c0_0], %c4 : memref<2x1xi32>
-// CHECK-NEXT:   "compute"(%4) : (index) -> ()
+// CHECK-NEXT:   %4 = affine_apply [[FLOOR_MOD_2]](%i0)
+// CHECK-NEXT:   %5 = affine_apply [[FLOOR_MOD_2]](%i0)
+// CHECK-NEXT:   dma_start %arg0[%i0], %0[%4, %c0_0], %c4, [[TAG]][%5, %c0_0] : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
+// CHECK-NEXT:   %6 = affine_apply [[REMAP_SHIFT_MINUS_4]](%i0)
+// CHECK-NEXT:   %7 = affine_apply [[FLOOR_MOD_2]](%6)
+// CHECK:        dma_wait [[TAG]][%7, %c0_0], %c4 : memref<2x1xi32>
+// CHECK-NEXT:   "compute"(%6) : (index) -> ()
 // CHECK-NEXT: }
 // CHECK-NEXT: [[SHIFTED:%[0-9]+]] = affine_apply [[REMAP_SHIFT_MINUS_4]](%c512)
-// CHECK-NEXT: %8 = affine_apply [[FLOOR_MOD_2]]([[SHIFTED]])
-// CHECK: dma_wait [[TAG]][%8, %c0_0], %c4 : memref<2x1xi32>
-// CHECK-NEXT: "compute"(%7) : (index) -> ()
+// CHECK-NEXT: %10 = affine_apply [[FLOOR_MOD_2]]([[SHIFTED]])
+// CHECK: dma_wait [[TAG]][%10, %c0_0], %c4 : memref<2x1xi32>
+// CHECK-NEXT: "compute"(%9) : (index) -> ()
 // CHECK-NEXT: return
 // CHECK-NEXT: }
 
 #map0 = (d0, d1) -> (d0, d1)
 #map1 = (d0, d1) -> ((d0 * 2048 + d1 * 256) floordiv 32, 0)
-#map2 = (d0) -> ((d0 * 2048) floordiv 32, 0)
-// CHECK: func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>
+#map2 = (d0) -> ((d0 * 2048) floordiv 32)
+// CHECK-LABEL: func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>
 func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>, #map0>, %arg1: memref<512x32xvector<8xf32>, #map0>,
                       %arg2: memref<512x32xvector<8xf32>, #map0>) {
   %num_elts = constant 256 : index
   %c0 = constant 0 : index
@@ -105,58 +111,63 @@ func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>, #map0>, %arg1: memref<
   %4 = alloc() : memref<2xi32>
   %5 = alloc() : memref<2xi32>
   // Prologue for DMA overlap on arg2.
+  // CHECK:[[TAG_ARG2:%[0-9]+]] = alloc() : memref<2x2xi32>
   // CHECK: dma_start %arg2[
-  // CHECK-NEXT: for %i0 = 1 to 8 {
+  // CHECK: for %i0 = 1 to 8 {
   for %i0 = 0 to 8 {
     %6 = affine_apply #map2(%i0)
-    dma_start %arg2[%6#0, %6#1], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
+    dma_start %arg2[%6, %c0], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
     dma_wait %5[%c0], %num_elts : memref<2xi32>
     // Steady state for DMA overlap on arg2
     // CHECK: dma_start %arg2[
-    // CHECK: dma_wait %1[
+    // CHECK: dma_wait [[TAG_ARG2]]
     // Prologue for DMA overlap on arg0, arg1 nested within i0
+    // CHECK: [[TAG_ARG0:%[0-9]+]] = alloc() : memref<2x2xi32>
+    // CHECK: [[TAG_ARG1:%[0-9]+]] = alloc() : memref<2x2xi32>
     // CHECK: dma_start %arg0[
     // CHECK: dma_start %arg1[
     // CHECK-NEXT for %i1 = 1 to 8 {
     for %i1 = 0 to 8 {
       %7 = affine_apply #map1(%i0, %i1)
       %8 = affine_apply #map2(%i1)
-      dma_start %arg0[%7#0, %7#1], %0[%c0, %c0], %num_elts, %3[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
-      dma_start %arg1[%8#0, %8#1], %1[%c0, %c0], %num_elts, %4[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
+      dma_start %arg0[%7, %c0], %0[%c0, %c0], %num_elts, %3[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
+      dma_start %arg1[%8, %c0], %1[%c0, %c0], %num_elts, %4[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
       dma_wait %3[%c0], %num_elts : memref<2xi32>
       dma_wait %4[%c0], %num_elts : memref<2xi32>
       // Steady state for DMA overlap on arg0, arg1
       // CHECK: dma_start %arg0[
       // CHECK: dma_start %arg1[
-      // CHECK: dma_wait %10[
-      // CHECK: dma_wait %11[
+      // CHECK: dma_wait [[TAG_ARG0]]
+      // CHECK: dma_wait [[TAG_ARG1]]
      // CHECK-NEXT: for %i2 = 0 to 4 {
      for %i2 = 0 to 4 {
        "foo"() : () -> ()
      }
    }
    // epilogue for arg0, arg1
-    // CHECK: dma_wait %10[
-    // CHECK: dma_wait %11[
-
-    // epilogue for DMA overlap on %arg2
-    // CHECK: dma_wait %1[%31, %c0_2], %c256 : memref<2x2xi32>
-    // Within the epilogue for arg2's DMA, we have the DMAs on %arg1, %arg2 nested.
-    // CHECK: dma_start %arg0[
-    // CHECK: dma_start %arg1[
-    // CHECK: for %i4 = 1 to 8 {
-    // CHECK: dma_start %arg0[
-    // CHECK: dma_start %arg1[
-    // CHECK: dma_wait %36[
-    // CHECK: dma_wait %37[
-    // CHECK: for %i5 = 0 to 4 {
-    // CHECK: "foo"() : () -> ()
-    // CHECK: dma_wait %36[
-    // CHECK: dma_wait %37[
-    // CHECK: for %i6 = 0 to 4 {
-
-  } // CHECK: }
-  return // CHECK-NEXT: return
+    // CHECK: dma_wait [[TAG_ARG0]]
+    // CHECK: dma_wait [[TAG_ARG1]]
+    // epilogue for DMA overlap on %arg2
+    // CHECK: dma_wait [[TAG_ARG2]]
+    // Within the epilogue for arg2's DMA, we have the DMAs on %arg1, %arg2 nested.
+    // CHECK: [[TAG_ARG0_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
+    // CHECK: [[TAG_ARG1_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
+    // CHECK: dma_start %arg0[
+    // CHECK: dma_start %arg1[
+    // CHECK: for %i4 = 1 to 8 {
+    // CHECK: dma_start %arg0[
+    // CHECK: dma_start %arg1[
+    // CHECK: dma_wait [[TAG_ARG0_NESTED]]
+    // CHECK: dma_wait [[TAG_ARG1_NESTED]]
+    // CHECK: for %i5 = 0 to 4 {
+    // CHECK: "foo"() : () -> ()
+    // CHECK: dma_wait [[TAG_ARG0_NESTED]]
+    // CHECK: dma_wait [[TAG_ARG1_NESTED]]
+    // CHECK: for %i6 = 0 to 4 {
+  }
+  return
+// CHECK: }
+// CHECK-NEXT: return
 }
 
 // CHECK: func @loop_dma_dependent
@@ -176,10 +187,10 @@ func @loop_dma_dependent(%arg2: memref<512x32xvector<8xf32>>) {
   // CHECK: for %i0 = 0 to 8 {
   for %i0 = 0 to 8 {
     %6 = affine_apply #map2(%i0)
-    dma_start %arg2[%6#0, %6#1], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
+    dma_start %arg2[%6, %c0], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
     dma_wait %5[%c0], %num_elts : memref<2xi32>
 
-    dma_start %2[%c0, %c0], %arg2[%6#0, %6#1], %num_elts, %5[%c0] : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
+    dma_start %2[%c0, %c0], %arg2[%6, %c0], %num_elts, %5[%c0] : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
     dma_wait %5[%c0], %num_elts : memref<2xi32>
   } // CHECK: }
   return // CHECK-NEXT: return
@@ -246,9 +257,9 @@ func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>) {
 // CHECK-NEXT: %1 = dim %0, 0 : memref<?x?xf32, 2>
 // CHECK-NEXT: %2 = dim %0, 1 : memref<?x?xf32, 2>
 // CHECK-NEXT: %3 = alloc(%1, %2) : memref<2x?x?xf32, 2>
-
-// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%5#0, %c0_0, %c0_0],
-// CHECK-NEXT: for %i0 = 1 to 16 {
+// CHECK: %5 = affine_apply [[MOD_2]](%c0)
+// CHECK: %6 = affine_apply [[MOD_2]](%c0)
+// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%5, %c0_0, %c0_0], %c512, %4[%6, %c0_0]
 
   for %kTT = 0 to 16 {
     dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %num_elt, %tag[%zero] : memref<512 x 32 x f32>,
@@ -256,9 +267,12 @@ func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>) {
     dma_wait %tag[%zero], %num_elt : memref<1 x i32>
   }
   return
-// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%6#0, %c0_0, %c0_0], %c512, %4[%6#1, %c0_0]
-// CHECK: dma_wait %4[%8, %c0_0], %c512 : memref<2x1xi32>
+// CHECK-NEXT: for %i0 = 1 to 16 {
+// CHECK:   %7 = affine_apply [[MOD_2]](%i0)
+// CHECK:   %8 = affine_apply [[MOD_2]](%i0)
+// CHECK:   dma_start %arg0[%c0_0, %c0_0], %3[%7, %c0_0, %c0_0], %c512, %4[%8, %c0_0]
+// CHECK:   dma_wait %4[%10, %c0_0], %c512 : memref<2x1xi32>
 // CHECK: }
-// CHECK: dma_wait %4[%11, %c0_0], %c512 : memref<2x1xi32>
+// CHECK: dma_wait %4[%13, %c0_0], %c512 : memref<2x1xi32>
 // CHECK-NEXT: return
 }
```

