Diffstat (limited to 'mlir/lib')
-rw-r--r--  mlir/lib/AffineOps/AffineOps.cpp              |  2
-rw-r--r--  mlir/lib/Analysis/AffineAnalysis.cpp          |  8
-rw-r--r--  mlir/lib/Analysis/Utils.cpp                   | 12
-rw-r--r--  mlir/lib/IR/Block.cpp                         |  2
-rw-r--r--  mlir/lib/Transforms/DmaGeneration.cpp         | 19
-rw-r--r--  mlir/lib/Transforms/LoopFusion.cpp            |  3
-rw-r--r--  mlir/lib/Transforms/LoopTiling.cpp            |  3
-rw-r--r--  mlir/lib/Transforms/LoopUnroll.cpp            |  2
-rw-r--r--  mlir/lib/Transforms/LoopUnrollAndJam.cpp      |  2
-rw-r--r--  mlir/lib/Transforms/LowerAffine.cpp           |  8
-rw-r--r--  mlir/lib/Transforms/LowerVectorTransfers.cpp  | 10
-rw-r--r--  mlir/lib/Transforms/MaterializeVectors.cpp    | 24
-rw-r--r--  mlir/lib/Transforms/MemRefDataFlowOpt.cpp     |  8
-rw-r--r--  mlir/lib/Transforms/PipelineDataTransfer.cpp  | 18
-rw-r--r--  mlir/lib/Transforms/Utils/LoopUtils.cpp       | 14
-rw-r--r--  mlir/lib/Transforms/Utils/Utils.cpp           |  4
-rw-r--r--  mlir/lib/Transforms/Vectorize.cpp             | 50
17 files changed, 97 insertions, 92 deletions
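
This change renames the textual form of the affine loop operation from `for` to `affine.for` in comments, examples, and the `AffineForOp` printer. As a rough illustration (a minimal sketch reusing the loop shapes that appear in the updated comments below; the bounds, memref type, and load are illustrative only, not taken from any single file), IR that previously printed as:

```mlir
for %i0 = 0 to 100 {
  for %i1 = 0 to 50 {
    %v = load %A[%i0, %i1] : memref<100x50xf32>
  }
}
```

now prints as:

```mlir
affine.for %i0 = 0 to 100 {
  affine.for %i1 = 0 to 50 {
    %v = load %A[%i0, %i1] : memref<100x50xf32>
  }
}
```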
diff --git a/mlir/lib/AffineOps/AffineOps.cpp b/mlir/lib/AffineOps/AffineOps.cpp
index 249b09f41cd..be5a2f14628 100644
--- a/mlir/lib/AffineOps/AffineOps.cpp
+++ b/mlir/lib/AffineOps/AffineOps.cpp
@@ -716,7 +716,7 @@ static void printBound(AffineBound bound, const char *prefix, OpAsmPrinter *p) {
 }
 
 void AffineForOp::print(OpAsmPrinter *p) const {
-  *p << "for ";
+  *p << "affine.for ";
   p->printOperand(getBody()->getArgument(0));
   *p << " = ";
   printBound(getLowerBound(), "max", p);
diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp
index 9d2ea691bdd..3a086ba512d 100644
--- a/mlir/lib/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Analysis/AffineAnalysis.cpp
@@ -756,8 +756,8 @@ void MemRefAccess::getAccessMap(AffineValueMap *accessMap) const {
 // For example, given the following MLIR code with with "source" and
 // "destination" accesses to the same memref labled, and symbols %M, %N, %K:
 //
-//   for %i0 = 0 to 100 {
-//     for %i1 = 0 to 50 {
+//   affine.for %i0 = 0 to 100 {
+//     affine.for %i1 = 0 to 50 {
 //       %a0 = affine.apply
 //         (d0, d1) -> (d0 * 2 - d1 * 4 + s1, d1 * 3 - s0) (%i0, %i1)[%M, %N]
 //       // Source memref access.
@@ -765,8 +765,8 @@ void MemRefAccess::getAccessMap(AffineValueMap *accessMap) const {
 //     }
 //   }
 //
-//   for %i2 = 0 to 100 {
-//     for %i3 = 0 to 50 {
+//   affine.for %i2 = 0 to 100 {
+//     affine.for %i3 = 0 to 50 {
 //       %a1 = affine.apply
 //         (d0, d1) -> (d0 * 7 + d1 * 9 - s1, d1 * 11 + s0) (%i2, %i3)[%K, %M]
 //       // Destination memref access.
diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp
index ae48e644a68..a48f39c2aac 100644
--- a/mlir/lib/Analysis/Utils.cpp
+++ b/mlir/lib/Analysis/Utils.cpp
@@ -36,13 +36,13 @@
 using namespace mlir;
 
 /// Populates 'loops' with IVs of the loops surrounding 'inst' ordered from
-/// the outermost 'for' instruction to the innermost one.
+/// the outermost 'affine.for' instruction to the innermost one.
 void mlir::getLoopIVs(const Instruction &inst,
                       SmallVectorImpl<OpPointer<AffineForOp>> *loops) {
   auto *currInst = inst.getParentInst();
   OpPointer<AffineForOp> currAffineForOp;
-  // Traverse up the hierarchy collecing all 'for' instruction while skipping
-  // over 'if' instructions.
+  // Traverse up the hierarchy collecing all 'affine.for' instruction while
+  // skipping over 'if' instructions.
   while (currInst && ((currAffineForOp = currInst->dyn_cast<AffineForOp>()) ||
                       currInst->isa<AffineIfOp>())) {
     if (currAffineForOp)
@@ -111,8 +111,8 @@ bool MemRefRegion::unionBoundingBox(const MemRefRegion &other) {
 //  For example, the memref region for this load operation at loopDepth = 1 will
 //  be as below:
 //
-//    for %i = 0 to 32 {
-//      for %ii = %i to (d0) -> (d0 + 8) (%i) {
+//    affine.for %i = 0 to 32 {
+//      affine.for %ii = %i to (d0) -> (d0 + 8) (%i) {
 //        load %A[%ii]
 //      }
 //    }
@@ -614,7 +614,7 @@ Optional<int64_t> mlir::getMemoryFootprintBytes(const Block &block,
                                                 int memorySpace) {
   std::vector<std::unique_ptr<MemRefRegion>> regions;
 
-  // Walk this 'for' instruction to gather all memory regions.
+  // Walk this 'affine.for' instruction to gather all memory regions.
   bool error = false;
   const_cast<Block *>(&block)->walk([&](Instruction *opInst) {
     if (!opInst->isa<LoadOp>() && !opInst->isa<StoreOp>()) {
diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp
index e0c76e9efad..83e15097942 100644
--- a/mlir/lib/IR/Block.cpp
+++ b/mlir/lib/IR/Block.cpp
@@ -189,7 +189,7 @@ unsigned Block::getNumSuccessors() const {
     return terminator->getNumSuccessors();
   }
   assert(getParent() && "top-level block with no terminator");
-  // Blocks inside 'for'/'if' instructions don't have successors.
+  // Blocks inside 'affine.for'/'if' instructions don't have successors.
   return 0;
 }
 
diff --git a/mlir/lib/Transforms/DmaGeneration.cpp b/mlir/lib/Transforms/DmaGeneration.cpp
index 855ff37f60f..631ebf939ea 100644
--- a/mlir/lib/Transforms/DmaGeneration.cpp
+++ b/mlir/lib/Transforms/DmaGeneration.cpp
@@ -338,7 +338,8 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, Block *block,
     auto fastMemRefType = top.getMemRefType(
         fastBufferShape, memRefType.getElementType(), {}, fastMemorySpace);
 
-    // Create the fast memory space buffer just before the 'for' instruction.
+    // Create the fast memory space buffer just before the 'affine.for'
+    // instruction.
     fastMemRef = prologue.create<AllocOp>(loc, fastMemRefType)->getResult();
     // Record it.
     fastBufferMap[memref] = fastMemRef;
@@ -456,7 +457,7 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
   // approach is conservative in some cases at the moment, we do a check later
   // and report an error with location info.
   // TODO(bondhugula): An 'if' instruction is being treated similar to an
-  // operation instruction. 'if''s could have 'for's in them; treat them
+  // operation instruction. 'if''s could have 'affine.for's in them; treat them
   // separately.
 
   // Get to the first load, store, or for op.
@@ -470,9 +471,9 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
     if (auto forOp = it->dyn_cast<AffineForOp>()) {
       // We'll assume for now that loops with steps are tiled loops, and so DMAs
       // are not performed for that depth, but only further inside.
-      // If the memory footprint of the 'for' loop is higher than fast memory
-      // capacity (when provided), we recurse to DMA at an inner level until
-      // we find a depth at which footprint fits in the capacity. If the
+      // If the memory footprint of the 'affine.for' loop is higher than fast
+      // memory capacity (when provided), we recurse to DMA at an inner level
+      // until we find a depth at which footprint fits in the capacity. If the
       // footprint can't be calcuated, we assume for now it fits.
 
       // Returns true if the footprint is known to exceed capacity.
@@ -489,13 +490,13 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
         consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
         // Recurse onto the body of this loop.
         runOnBlock(forOp->getBody(), consumedCapacityBytes);
-        // The next region starts right after the 'for' instruction.
+        // The next region starts right after the 'affine.for' instruction.
        curBegin = std::next(it);
       } else {
         // We have enough capacity, i.e., DMAs will be computed for the portion
-        // of the block until 'it', and for the 'for' loop. For the latter, they
-        // are placed just before this loop (for incoming DMAs) and right after
-        // (for outgoing ones).
+        // of the block until 'it', and for the 'affine.for' loop. For the
+        // latter, they are placed just before this loop (for incoming DMAs) and
+        // right after (for outgoing ones).
         consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
 
         // Inner loop DMAs have their own scope - we don't thus update consumed
diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp
index 8d5f51059bf..9e96b0800b3 100644
--- a/mlir/lib/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Transforms/LoopFusion.cpp
@@ -510,7 +510,8 @@ bool MemRefDependenceGraph::init(Function *f) {
       // all loads and store accesses it contains.
       LoopNestStateCollector collector;
       collector.collect(&inst);
-      // Return false if a non 'for' region was found (not currently supported).
+      // Return false if a non 'affine.for' region was found (not currently
+      // supported).
       if (collector.hasNonForRegion)
         return false;
       Node node(nextNodeId++, &inst);
diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Transforms/LoopTiling.cpp
index 368a1dac1df..f00c2e767e6 100644
--- a/mlir/lib/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Transforms/LoopTiling.cpp
@@ -231,7 +231,8 @@ UtilResult mlir::tileCodeGen(MutableArrayRef<OpPointer<AffineForOp>> band,
 static void
 getTileableBands(Function *f,
                  std::vector<SmallVector<OpPointer<AffineForOp>, 6>> *bands) {
-  // Get maximal perfect nest of 'for' insts starting from root (inclusive).
+  // Get maximal perfect nest of 'affine.for' insts starting from root
+  // (inclusive).
   auto getMaximalPerfectLoopNest = [&](OpPointer<AffineForOp> root) {
     SmallVector<OpPointer<AffineForOp>, 6> band;
     OpPointer<AffineForOp> currInst = root;
diff --git a/mlir/lib/Transforms/LoopUnroll.cpp b/mlir/lib/Transforms/LoopUnroll.cpp
index 3a7cfb85e08..025a86891df 100644
--- a/mlir/lib/Transforms/LoopUnroll.cpp
+++ b/mlir/lib/Transforms/LoopUnroll.cpp
@@ -164,7 +164,7 @@ PassResult LoopUnroll::runOnFunction(Function *f) {
   return success();
 }
 
-/// Unrolls a 'for' inst. Returns true if the loop was unrolled, false
+/// Unrolls a 'affine.for' inst. Returns true if the loop was unrolled, false
 /// otherwise. The default unroll factor is 4.
 bool LoopUnroll::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
   // Use the function callback if one was provided.
diff --git a/mlir/lib/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Transforms/LoopUnrollAndJam.cpp
index b2aed7d9d7f..2f0249824dd 100644
--- a/mlir/lib/Transforms/LoopUnrollAndJam.cpp
+++ b/mlir/lib/Transforms/LoopUnrollAndJam.cpp
@@ -105,7 +105,7 @@ PassResult LoopUnrollAndJam::runOnFunction(Function *f) {
   return success();
 }
 
-/// Unroll and jam a 'for' inst. Default unroll jam factor is
+/// Unroll and jam a 'affine.for' inst. Default unroll jam factor is
 /// kDefaultUnrollJamFactor. Return false if nothing was done.
 bool LoopUnrollAndJam::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
   // Unroll and jam by the factor that was passed if any.
diff --git a/mlir/lib/Transforms/LowerAffine.cpp b/mlir/lib/Transforms/LowerAffine.cpp
index 0d8eb8a4761..ef45891c26f 100644
--- a/mlir/lib/Transforms/LowerAffine.cpp
+++ b/mlir/lib/Transforms/LowerAffine.cpp
@@ -283,7 +283,8 @@ static Value *buildMinMaxReductionSeq(Location loc, CmpIPredicate predicate,
   return value;
 }
 
-// Convert a "for" loop to a flow of blocks.  Return `false` on success.
+// Convert a "affine.for" loop to a flow of blocks.  Return `false` on
+// success.
 //
 // Create an SESE region for the loop (including its body) and append it to the
 // end of the current region.  The loop region consists of the initialization
@@ -330,8 +331,9 @@ bool LowerAffinePass::lowerAffineFor(OpPointer<AffineForOp> forOp) {
   auto loc = forOp->getLoc();
   auto *forInst = forOp->getInstruction();
 
-  // Start by splitting the block containing the 'for' into two parts.  The part
-  // before will get the init code, the part after will be the end point.
+  // Start by splitting the block containing the 'affine.for' into two parts.
+  // The part before will get the init code, the part after will be the end
+  // point.
   auto *initBlock = forInst->getBlock();
   auto *endBlock = initBlock->splitBlock(forInst);
 
diff --git a/mlir/lib/Transforms/LowerVectorTransfers.cpp b/mlir/lib/Transforms/LowerVectorTransfers.cpp
index 63fb45db9c5..e63d3c8111c 100644
--- a/mlir/lib/Transforms/LowerVectorTransfers.cpp
+++ b/mlir/lib/Transforms/LowerVectorTransfers.cpp
@@ -126,9 +126,9 @@ private:
 ///    // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into
 ///    // vector<32x256xf32> and pad with %f0 to handle the boundary case:
 ///    %f0 = constant 0.0f : f32
-///    for %i0 = 0 to %0 {
-///      for %i1 = 0 to %1 step 256 {
-///        for %i2 = 0 to %2 step 32 {
+///    affine.for %i0 = 0 to %0 {
+///      affine.for %i1 = 0 to %1 step 256 {
+///        affine.for %i2 = 0 to %2 step 32 {
 ///          %v = vector_transfer_read %A, %i0, %i1, %i2, %f0
 ///               {permutation_map: (d0, d1, d2) -> (d2, d1)} :
 ///               (memref<?x?x?xf32>, index, index, f32) -> vector<32x256xf32>
@@ -139,8 +139,8 @@ private:
 /// MLIR resembling:
 ///
 /// ```mlir
-///    for %d1 = 0 to 256 {
-///      for %d2 = 0 to 32 {
+///    affine.for %d1 = 0 to 256 {
+///      affine.for %d2 = 0 to 32 {
 ///        %s = %A[%i0, %i1 + %d1, %i2 + %d2] : f32
 ///        %tmp[%d2, %d1] = %s
 ///      }
diff --git a/mlir/lib/Transforms/MaterializeVectors.cpp b/mlir/lib/Transforms/MaterializeVectors.cpp
index be5a03bc416..4434ab5322e 100644
--- a/mlir/lib/Transforms/MaterializeVectors.cpp
+++ b/mlir/lib/Transforms/MaterializeVectors.cpp
@@ -101,10 +101,10 @@
 ///    mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) {
 ///      %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
 ///      %f1 = constant splat<vector<4x4x4xf32>, 1.000000e+00> :
-///      vector<4x4x4xf32> for %i0 = 0 to %M step 4 {
-///        for %i1 = 0 to %N step 4 {
-///          for %i2 = 0 to %O {
-///            for %i3 = 0 to %P step 4 {
+///      vector<4x4x4xf32> affine.for %i0 = 0 to %M step 4 {
+///        affine.for %i1 = 0 to %N step 4 {
+///          affine.for %i2 = 0 to %O {
+///            affine.for %i3 = 0 to %P step 4 {
 ///              vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3
 ///                {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} :
 ///                 vector<4x4x4xf32>, memref<?x?x?x?xf32, 0>,
@@ -120,10 +120,10 @@
 ///    mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) {
 ///      %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
 ///      %f1 = constant splat<vector<4x4xf32>, 1.000000e+00> : vector<4x4x4xf32>
-///       for %i0 = 0 to %arg0 step 4 {
-///         for %i1 = 0 to %arg1 step 4 {
-///           for %i2 = 0 to %arg2 {
-///             for %i3 = 0 to %arg3 step 4 {
+///       affine.for %i0 = 0 to %arg0 step 4 {
+///         affine.for %i1 = 0 to %arg1 step 4 {
+///           affine.for %i2 = 0 to %arg2 {
+///             affine.for %i3 = 0 to %arg3 step 4 {
 ///               %1 = affine.apply (d0, d1, d2, d3) -> (d0, d1, d2, d3)
 ///                    (%i0, %i1, %i2, %i3)
 ///               vector_transfer_write f1, %0, %1#0, %1#1, %1#2, %1#3
@@ -293,10 +293,10 @@ static Value *substitute(Value *v, VectorType hwVectorType,
 /// super-vectorization has been applied:
 ///
 /// ```mlir
-/// for %i0 = 0 to %M {
-///   for %i1 = 0 to %N step 3 {
-///     for %i2 = 0 to %O {
-///       for %i3 = 0 to %P step 32 {
+/// affine.for %i0 = 0 to %M {
+///   affine.for %i1 = 0 to %N step 3 {
+///     affine.for %i2 = 0 to %O {
+///       affine.for %i3 = 0 to %P step 32 {
 ///         %r = vector_transfer_read(%A, map(%i..)#0, map(%i..)#1, map(%i..)#2)
 ///                                   -> vector<3x32xf32>
 ///         ...
diff --git a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp
index d9f940a01f3..3141d748750 100644
--- a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp
+++ b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp
@@ -19,7 +19,7 @@
 // potentially getting rid of intermediate memref's entirely.
 // TODO(mlir-team): In the future, similar techniques could be used to eliminate
 // dead memref store's and perform more complex forwarding when support for
-// SSA scalars live out of 'for'/'if' statements is available.
+// SSA scalars live out of 'affine.for'/'if' statements is available.
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/AffineAnalysis.h"
@@ -55,7 +55,7 @@ namespace {
 //
 // (* A dependence being satisfied at a block: a dependence that is satisfied by
 // virtue of the destination instruction appearing textually / lexically after
-// the source instruction within the body of a 'for' instruction; thus, a
+// the source instruction within the body of a 'affine.for' instruction; thus, a
 // dependence is always either satisfied by a loop or by a block).
 //
 // The above conditions are simple to check, sufficient, and powerful for most
@@ -145,8 +145,8 @@ void MemRefDataFlowOpt::forwardStoreToLoad(OpPointer<LoadOp> loadOp) {
       // Check if this store is a candidate for forwarding; we only forward if
       // the dependence from the store is carried by the *body* of innermost
       // common surrounding loop. As an example this filters out cases like:
-      // for %i0
-      //   for %i1
+      // affine.for %i0
+      //   affine.for %i1
       //     %idx = affine.apply (d0) -> (d0 + 1) (%i0)
      //     store %A[%idx]
       //     load %A[%i0]
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index cfa045f2279..84c8cd830dc 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -71,11 +71,11 @@ static unsigned getTagMemRefPos(const Instruction &dmaInst) {
   return 0;
 }
 
-/// Doubles the buffer of the supplied memref on the specified 'for' instruction
-/// by adding a leading dimension of size two to the memref. Replaces all uses
-/// of the old memref by the new one while indexing the newly added dimension by
-/// the loop IV of the specified 'for' instruction modulo 2. Returns false if
-/// such a replacement cannot be performed.
+/// Doubles the buffer of the supplied memref on the specified 'affine.for'
+/// instruction by adding a leading dimension of size two to the memref.
+/// Replaces all uses of the old memref by the new one while indexing the newly
+/// added dimension by the loop IV of the specified 'affine.for' instruction
+/// modulo 2. Returns false if such a replacement cannot be performed.
 static bool doubleBuffer(Value *oldMemRef, OpPointer<AffineForOp> forOp) {
   auto *forBody = forOp->getBody();
   FuncBuilder bInner(forBody, forBody->begin());
@@ -108,7 +108,7 @@ static bool doubleBuffer(Value *oldMemRef, OpPointer<AffineForOp> forOp) {
                                                    dynamicDimCount++));
   }
 
-  // Create and place the alloc right before the 'for' instruction.
+  // Create and place the alloc right before the 'affine.for' instruction.
   // TODO(mlir-team): we are assuming scoped allocation here, and aren't
   // inserting a dealloc -- this isn't the right thing.
   Value *newMemRef =
@@ -137,9 +137,9 @@ static bool doubleBuffer(Value *oldMemRef, OpPointer<AffineForOp> forOp) {
 /// Returns success if the IR is in a valid state.
 PassResult PipelineDataTransfer::runOnFunction(Function *f) {
   // Do a post order walk so that inner loop DMAs are processed first. This is
-  // necessary since 'for' instructions nested within would otherwise become
-  // invalid (erased) when the outer loop is pipelined (the pipelined one gets
-  // deleted and replaced by a prologue, a new steady-state loop and an
+  // necessary since 'affine.for' instructions nested within would otherwise
+  // become invalid (erased) when the outer loop is pipelined (the pipelined one
+  // gets deleted and replaced by a prologue, a new steady-state loop and an
   // epilogue).
   forOps.clear();
   f->walkPostOrder<AffineForOp>(
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index a1903ace026..110949f43d5 100644
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -138,8 +138,8 @@ void mlir::promoteSingleIterationLoops(Function *f) {
       [](OpPointer<AffineForOp> forOp) { promoteIfSingleIteration(forOp); });
 }
 
-/// Generates a 'for' inst with the specified lower and upper bounds while
-/// generating the right IV remappings for the shifted instructions. The
+/// Generates a 'affine.for' inst with the specified lower and upper bounds
+/// while generating the right IV remappings for the shifted instructions. The
 /// instruction blocks that go into the loop are specified in instGroupQueue
 /// starting from the specified offset, and in that order; the first element of
 /// the pair specifies the shift applied to that group of instructions; note
@@ -194,10 +194,10 @@ generateLoop(AffineMap lbMap, AffineMap ubMap,
   return loopChunk;
 }
 
-/// Skew the instructions in the body of a 'for' instruction with the specified
-/// instruction-wise shifts. The shifts are with respect to the original
-/// execution order, and are multiplied by the loop 'step' before being applied.
-/// A shift of zero for each instruction will lead to no change.
+/// Skew the instructions in the body of a 'affine.for' instruction with the
+/// specified instruction-wise shifts. The shifts are with respect to the
+/// original execution order, and are multiplied by the loop 'step' before being
+/// applied. A shift of zero for each instruction will lead to no change.
 // The skewing of instructions with respect to one another can be used for
 // example to allow overlap of asynchronous operations (such as DMA
 // communication) with computation, or just relative shifting of instructions
@@ -246,7 +246,7 @@ UtilResult mlir::instBodySkew(OpPointer<AffineForOp> forOp,
 
   // An array of instruction groups sorted by shift amount; each group has all
   // instructions with the same shift in the order in which they appear in the
-  // body of the 'for' inst.
+  // body of the 'affine.for' inst.
   std::vector<std::vector<Instruction *>> sortedInstGroups(maxShift + 1);
   unsigned pos = 0;
   for (auto &inst : *forOp->getBody()) {
diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp
index 41689be52fc..90d28bf34df 100644
--- a/mlir/lib/Transforms/Utils/Utils.cpp
+++ b/mlir/lib/Transforms/Utils/Utils.cpp
@@ -194,14 +194,14 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
 ///
 /// Before
 ///
-/// for %i = 0 to #map(%N)
+/// affine.for %i = 0 to #map(%N)
 ///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
 ///   "send"(%idx, %A, ...)
 ///   "compute"(%idx)
 ///
 /// After
 ///
-/// for %i = 0 to #map(%N)
+/// affine.for %i = 0 to #map(%N)
 ///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
 ///   "send"(%idx, %A, ...)
 ///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
diff --git a/mlir/lib/Transforms/Vectorize.cpp b/mlir/lib/Transforms/Vectorize.cpp
index 5a8d5d24661..1f4c7b9fcc8 100644
--- a/mlir/lib/Transforms/Vectorize.cpp
+++ b/mlir/lib/Transforms/Vectorize.cpp
@@ -113,7 +113,7 @@ using namespace mlir;
 ///
 /// At a high level, a vectorized load in a loop will resemble:
 /// ```mlir
-///   for %i = ? to ? step ? {
+///   affine.for %i = ? to ? step ? {
 ///     %v_a = "vector_transfer_read" (A, %i) : (memref<?xf32>, index) ->
 ///                                              vector<128xf32>
 ///   }
@@ -309,7 +309,7 @@ using namespace mlir;
 /// ```mlir
 /// mlfunc @fill(%A : memref<128xf32>) -> () {
 ///   %f1 = constant 1.0 : f32
-///   for %i0 = 0 to 32 {
+///   affine.for %i0 = 0 to 32 {
 ///     store %f1, %A[%i0] : memref<128xf32, 0>
 ///   }
 ///   return
@@ -322,7 +322,7 @@ using namespace mlir;
 /// is still subject to exploratory tradeoffs. In particular, say we want to
 /// vectorize by a factor 128, we want to transform the following input:
 /// ```mlir
-///   for %i = %M to %N {
+///   affine.for %i = %M to %N {
 ///     %a = load A[%i] : memref<?xf32>
 ///   }
 /// ```
@@ -331,8 +331,8 @@ using namespace mlir;
 /// memory promotion etc) say after stripmining (and potentially unrolling in
 /// the case of LLVM's SLP vectorizer):
 /// ```mlir
-///   for %i = floor(%M, 128) to ceil(%N, 128) {
-///     for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) {
+///   affine.for %i = floor(%M, 128) to ceil(%N, 128) {
+///     affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) {
 ///       %a = load A[%ii] : memref<?xf32>
 ///     }
 ///   }
@@ -341,7 +341,7 @@ using namespace mlir;
 /// Instead, we seek to vectorize early and freeze vector types before
 /// scheduling, so we want to generate a pattern that resembles:
 /// ```mlir
-///   for %i = ? to ? step ? {
+///   affine.for %i = ? to ? step ? {
 ///     %v_a = "vector_transfer_read" (A, %i) : (memref<?xf32>, index) ->
 ///                                              vector<128xf32>
 ///   }
@@ -362,7 +362,7 @@ using namespace mlir;
 /// For the simple strawman example above, vectorizing for a 1-D vector
 /// abstraction of size 128 returns code similar to:
 /// ```mlir
-///   for %i = %M to %N step 128 {
+///   affine.for %i = %M to %N step 128 {
 ///     %v_a = "vector_transfer_read" (A, %i) : (memref<?xf32>, index) ->
 ///                                              vector<128xf32>
 ///   }
@@ -391,20 +391,20 @@ using namespace mlir;
 ///   %C = alloc (%M, %N) : memref<?x?xf32, 0>
 ///   %f1 = constant 1.0 : f32
 ///   %f2 = constant 2.0 : f32
-///   for %i0 = 0 to %M {
-///     for %i1 = 0 to %N {
+///   affine.for %i0 = 0 to %M {
+///     affine.for %i1 = 0 to %N {
 ///       // non-scoped %f1
 ///       store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
 ///     }
 ///   }
-///   for %i2 = 0 to %M {
-///     for %i3 = 0 to %N {
+///   affine.for %i2 = 0 to %M {
+///     affine.for %i3 = 0 to %N {
 ///       // non-scoped %f2
 ///       store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
 ///     }
 ///   }
-///   for %i4 = 0 to %M {
-///     for %i5 = 0 to %N {
+///   affine.for %i4 = 0 to %M {
+///     affine.for %i5 = 0 to %N {
 ///       %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
 ///       %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>
 ///       %s5 = addf %a5, %b5 : f32
@@ -438,24 +438,24 @@ using namespace mlir;
 ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
 ///   %cst = constant 1.0 : f32
 ///   %cst_0 = constant 2.0 : f32
-///   for %i0 = 0 to %arg0 {
-///     for %i1 = 0 to %arg1 step 256 {
+///   affine.for %i0 = 0 to %arg0 {
+///     affine.for %i1 = 0 to %arg1 step 256 {
 ///       %cst_1 = constant splat<vector<256xf32>, 1.0> :
 ///                vector<256xf32>
 ///       "vector_transfer_write"(%cst_1, %0, %i0, %i1) :
 ///                (vector<256xf32>, memref<?x?xf32>, index, index) -> ()
 ///     }
 ///   }
-///   for %i2 = 0 to %arg0 {
-///     for %i3 = 0 to %arg1 step 256 {
+///   affine.for %i2 = 0 to %arg0 {
+///     affine.for %i3 = 0 to %arg1 step 256 {
 ///       %cst_2 = constant splat<vector<256xf32>, 2.0> :
 ///                vector<256xf32>
 ///       "vector_transfer_write"(%cst_2, %1, %i2, %i3) :
 ///                (vector<256xf32>, memref<?x?xf32>, index, index) -> ()
 ///     }
 ///   }
-///   for %i4 = 0 to %arg0 {
-///     for %i5 = 0 to %arg1 step 256 {
+///   affine.for %i4 = 0 to %arg0 {
+///     affine.for %i5 = 0 to %arg1 step 256 {
 ///       %3 = "vector_transfer_read"(%0, %i4, %i5) :
 ///                      (memref<?x?xf32>, index, index) -> vector<256xf32>
 ///       %4 = "vector_transfer_read"(%1, %i4, %i5) :
@@ -494,24 +494,24 @@ using namespace mlir;
 ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
 ///   %cst = constant 1.0 : f32
 ///   %cst_0 = constant 2.0 : f32
-///   for %i0 = 0 to %arg0 step 32 {
-///     for %i1 = 0 to %arg1 step 256 {
+///   affine.for %i0 = 0 to %arg0 step 32 {
+///     affine.for %i1 = 0 to %arg1 step 256 {
 ///       %cst_1 = constant splat<vector<32x256xf32>, 1.0> :
 ///                vector<32x256xf32>
 ///       "vector_transfer_write"(%cst_1, %0, %i0, %i1) :
 ///                (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
 ///     }
 ///   }
-///   for %i2 = 0 to %arg0 step 32 {
-///     for %i3 = 0 to %arg1 step 256 {
+///   affine.for %i2 = 0 to %arg0 step 32 {
+///     affine.for %i3 = 0 to %arg1 step 256 {
 ///       %cst_2 = constant splat<vector<32x256xf32>, 2.0> :
 ///                vector<32x256xf32>
 ///       "vector_transfer_write"(%cst_2, %1, %i2, %i3) :
 ///                (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
 ///     }
 ///   }
-///   for %i4 = 0 to %arg0 step 32 {
-///     for %i5 = 0 to %arg1 step 256 {
+///   affine.for %i4 = 0 to %arg0 step 32 {
+///     affine.for %i5 = 0 to %arg1 step 256 {
 ///       %3 = "vector_transfer_read"(%0, %i4, %i5) :
 ///                (memref<?x?xf32>, index, index) -> vector<32x256xf32>
 ///       %4 = "vector_transfer_read"(%1, %i4, %i5) :

