Diffstat (limited to 'mlir/lib/Transforms')
-rw-r--r--  mlir/lib/Transforms/DmaGeneration.cpp        | 10
-rw-r--r--  mlir/lib/Transforms/LoopFusion.cpp           |  3
-rw-r--r--  mlir/lib/Transforms/LoopTiling.cpp           |  3
-rw-r--r--  mlir/lib/Transforms/LoopUnroll.cpp           |  2
-rw-r--r--  mlir/lib/Transforms/LoopUnrollAndJam.cpp     |  2
-rw-r--r--  mlir/lib/Transforms/LowerAffine.cpp          |  8
-rw-r--r--  mlir/lib/Transforms/LowerVectorTransfers.cpp | 10
-rw-r--r--  mlir/lib/Transforms/MaterializeVectors.cpp   | 24
-rw-r--r--  mlir/lib/Transforms/MemRefDataFlowOpt.cpp    |  8
-rw-r--r--  mlir/lib/Transforms/PipelineDataTransfer.cpp | 18
-rw-r--r--  mlir/lib/Transforms/Utils/LoopUtils.cpp      | 14
-rw-r--r--  mlir/lib/Transforms/Utils/Utils.cpp          |  4
-rw-r--r--  mlir/lib/Transforms/Vectorize.cpp            | 50
13 files changed, 76 insertions, 80 deletions
diff --git a/mlir/lib/Transforms/DmaGeneration.cpp b/mlir/lib/Transforms/DmaGeneration.cpp
index dcb4828d0bf..bda98f46b61 100644
--- a/mlir/lib/Transforms/DmaGeneration.cpp
+++ b/mlir/lib/Transforms/DmaGeneration.cpp
@@ -338,7 +338,7 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, Block *block,
   auto fastMemRefType = top.getMemRefType(
       fastBufferShape, memRefType.getElementType(), {}, fastMemorySpace);

-    // Create the fast memory space buffer just before the 'affine.for'
+    // Create the fast memory space buffer just before the 'for'
     // instruction.
     fastMemRef = prologue.create<AllocOp>(loc, fastMemRefType)->getResult();
     // Record it.
@@ -457,7 +457,7 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
   // approach is conservative in some cases at the moment, we do a check later
   // and report an error with location info.
   // TODO(bondhugula): An 'affine.if' instruction is being treated similar to an
-  // operation instruction. 'affine.if''s could have 'affine.for's in them;
+  // operation instruction. 'affine.if''s could have 'for's in them;
   // treat them separately.

   // Get to the first load, store, or for op.
@@ -471,7 +471,7 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
     if (auto forOp = it->dyn_cast<AffineForOp>()) {
       // We'll assume for now that loops with steps are tiled loops, and so DMAs
       // are not performed for that depth, but only further inside.
-      // If the memory footprint of the 'affine.for' loop is higher than fast
+      // If the memory footprint of the 'for' loop is higher than fast
       // memory capacity (when provided), we recurse to DMA at an inner level
       // until we find a depth at which footprint fits in the capacity. If the
      // footprint can't be calcuated, we assume for now it fits.
@@ -490,11 +490,11 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
         consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
         // Recurse onto the body of this loop.
         runOnBlock(forOp->getBody(), consumedCapacityBytes);
-        // The next region starts right after the 'affine.for' instruction.
+        // The next region starts right after the 'for' instruction.
         curBegin = std::next(it);
       } else {
         // We have enough capacity, i.e., DMAs will be computed for the portion
-        // of the block until 'it', and for the 'affine.for' loop. For the
+        // of the block until 'it', and for the 'for' loop. For the
         // latter, they are placed just before this loop (for incoming DMAs) and
         // right after (for outgoing ones).
         consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp
index 9e96b0800b3..8d5f51059bf 100644
--- a/mlir/lib/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Transforms/LoopFusion.cpp
@@ -510,8 +510,7 @@ bool MemRefDependenceGraph::init(Function *f) {
       // all loads and store accesses it contains.
       LoopNestStateCollector collector;
       collector.collect(&inst);
-      // Return false if a non 'affine.for' region was found (not currently
-      // supported).
+      // Return false if a non 'for' region was found (not currently supported).
       if (collector.hasNonForRegion)
         return false;
       Node node(nextNodeId++, &inst);
diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Transforms/LoopTiling.cpp
index f00c2e767e6..368a1dac1df 100644
--- a/mlir/lib/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Transforms/LoopTiling.cpp
@@ -231,8 +231,7 @@ UtilResult mlir::tileCodeGen(MutableArrayRef<OpPointer<AffineForOp>> band,
 static void
 getTileableBands(Function *f,
                  std::vector<SmallVector<OpPointer<AffineForOp>, 6>> *bands) {
-  // Get maximal perfect nest of 'affine.for' insts starting from root
-  // (inclusive).
+  // Get maximal perfect nest of 'for' insts starting from root (inclusive).
   auto getMaximalPerfectLoopNest = [&](OpPointer<AffineForOp> root) {
     SmallVector<OpPointer<AffineForOp>, 6> band;
     OpPointer<AffineForOp> currInst = root;
diff --git a/mlir/lib/Transforms/LoopUnroll.cpp b/mlir/lib/Transforms/LoopUnroll.cpp
index 025a86891df..3a7cfb85e08 100644
--- a/mlir/lib/Transforms/LoopUnroll.cpp
+++ b/mlir/lib/Transforms/LoopUnroll.cpp
@@ -164,7 +164,7 @@ PassResult LoopUnroll::runOnFunction(Function *f) {
   return success();
 }

-/// Unrolls a 'affine.for' inst. Returns true if the loop was unrolled, false
+/// Unrolls a 'for' inst. Returns true if the loop was unrolled, false
 /// otherwise. The default unroll factor is 4.
 bool LoopUnroll::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
   // Use the function callback if one was provided.
diff --git a/mlir/lib/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Transforms/LoopUnrollAndJam.cpp
index 2f0249824dd..b2aed7d9d7f 100644
--- a/mlir/lib/Transforms/LoopUnrollAndJam.cpp
+++ b/mlir/lib/Transforms/LoopUnrollAndJam.cpp
@@ -105,7 +105,7 @@ PassResult LoopUnrollAndJam::runOnFunction(Function *f) {
   return success();
 }

-/// Unroll and jam a 'affine.for' inst. Default unroll jam factor is
+/// Unroll and jam a 'for' inst. Default unroll jam factor is
 /// kDefaultUnrollJamFactor. Return false if nothing was done.
 bool LoopUnrollAndJam::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
   // Unroll and jam by the factor that was passed if any.
diff --git a/mlir/lib/Transforms/LowerAffine.cpp b/mlir/lib/Transforms/LowerAffine.cpp
index 5ce8a6258f4..ef6ff420912 100644
--- a/mlir/lib/Transforms/LowerAffine.cpp
+++ b/mlir/lib/Transforms/LowerAffine.cpp
@@ -283,8 +283,7 @@ static Value *buildMinMaxReductionSeq(Location loc, CmpIPredicate predicate,
   return value;
 }

-// Convert a "affine.for" loop to a flow of blocks. Return `false` on
-// success.
+// Convert a "for" loop to a flow of blocks. Return `false` on success.
 //
 // Create an SESE region for the loop (including its body) and append it to the
 // end of the current region. The loop region consists of the initialization
@@ -331,9 +330,8 @@ bool LowerAffinePass::lowerAffineFor(OpPointer<AffineForOp> forOp) {
   auto loc = forOp->getLoc();
   auto *forInst = forOp->getInstruction();

-  // Start by splitting the block containing the 'affine.for' into two parts.
-  // The part before will get the init code, the part after will be the end
-  // point.
+  // Start by splitting the block containing the 'for' into two parts. The part
+  // before will get the init code, the part after will be the end point.
   auto *initBlock = forInst->getBlock();
   auto *endBlock = initBlock->splitBlock(forInst);

diff --git a/mlir/lib/Transforms/LowerVectorTransfers.cpp b/mlir/lib/Transforms/LowerVectorTransfers.cpp
index e63d3c8111c..63fb45db9c5 100644
--- a/mlir/lib/Transforms/LowerVectorTransfers.cpp
+++ b/mlir/lib/Transforms/LowerVectorTransfers.cpp
@@ -126,9 +126,9 @@ private:
 ///    // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into
 ///    // vector<32x256xf32> and pad with %f0 to handle the boundary case:
 ///    %f0 = constant 0.0f : f32
-///    affine.for %i0 = 0 to %0 {
-///      affine.for %i1 = 0 to %1 step 256 {
-///        affine.for %i2 = 0 to %2 step 32 {
+///    for %i0 = 0 to %0 {
+///      for %i1 = 0 to %1 step 256 {
+///        for %i2 = 0 to %2 step 32 {
 ///          %v = vector_transfer_read %A, %i0, %i1, %i2, %f0
 ///               {permutation_map: (d0, d1, d2) -> (d2, d1)} :
 ///               (memref<?x?x?xf32>, index, index, f32) -> vector<32x256xf32>
@@ -139,8 +139,8 @@ private:
 /// MLIR resembling:
 ///
 /// ```mlir
-///    affine.for %d1 = 0 to 256 {
-///      affine.for %d2 = 0 to 32 {
+///    for %d1 = 0 to 256 {
+///      for %d2 = 0 to 32 {
 ///        %s = %A[%i0, %i1 + %d1, %i2 + %d2] : f32
 ///        %tmp[%d2, %d1] = %s
 ///      }
diff --git a/mlir/lib/Transforms/MaterializeVectors.cpp b/mlir/lib/Transforms/MaterializeVectors.cpp
index 4434ab5322e..be5a03bc416 100644
--- a/mlir/lib/Transforms/MaterializeVectors.cpp
+++ b/mlir/lib/Transforms/MaterializeVectors.cpp
@@ -101,10 +101,10 @@
 /// mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) {
 ///   %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
 ///   %f1 = constant splat<vector<4x4x4xf32>, 1.000000e+00> :
-///   vector<4x4x4xf32> affine.for %i0 = 0 to %M step 4 {
-///     affine.for %i1 = 0 to %N step 4 {
-///       affine.for %i2 = 0 to %O {
-///         affine.for %i3 = 0 to %P step 4 {
+///   vector<4x4x4xf32> for %i0 = 0 to %M step 4 {
+///     for %i1 = 0 to %N step 4 {
+///       for %i2 = 0 to %O {
+///         for %i3 = 0 to %P step 4 {
 ///           vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3
 ///             {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} :
 ///             vector<4x4x4xf32>, memref<?x?x?x?xf32, 0>,
@@ -120,10 +120,10 @@
 /// mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) {
 ///   %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
 ///   %f1 = constant splat<vector<4x4xf32>, 1.000000e+00> : vector<4x4x4xf32>
-///   affine.for %i0 = 0 to %arg0 step 4 {
-///     affine.for %i1 = 0 to %arg1 step 4 {
-///       affine.for %i2 = 0 to %arg2 {
-///         affine.for %i3 = 0 to %arg3 step 4 {
+///   for %i0 = 0 to %arg0 step 4 {
+///     for %i1 = 0 to %arg1 step 4 {
+///       for %i2 = 0 to %arg2 {
+///         for %i3 = 0 to %arg3 step 4 {
 ///           %1 = affine.apply (d0, d1, d2, d3) -> (d0, d1, d2, d3)
 ///                             (%i0, %i1, %i2, %i3)
 ///           vector_transfer_write f1, %0, %1#0, %1#1, %1#2, %1#3
@@ -293,10 +293,10 @@ static Value *substitute(Value *v, VectorType hwVectorType,
 /// super-vectorization has been applied:
 ///
 /// ```mlir
-/// affine.for %i0 = 0 to %M {
-///   affine.for %i1 = 0 to %N step 3 {
-///     affine.for %i2 = 0 to %O {
-///       affine.for %i3 = 0 to %P step 32 {
+/// for %i0 = 0 to %M {
+///   for %i1 = 0 to %N step 3 {
+///     for %i2 = 0 to %O {
+///       for %i3 = 0 to %P step 32 {
 ///         %r = vector_transfer_read(%A, map(%i..)#0, map(%i..)#1, map(%i..)#2)
 ///              -> vector<3x32xf32>
 ///         ...
diff --git a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp
index 91a17764358..ad9801fea89 100644
--- a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp
+++ b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp
@@ -19,7 +19,7 @@
 // potentially getting rid of intermediate memref's entirely.
 // TODO(mlir-team): In the future, similar techniques could be used to eliminate
 // dead memref store's and perform more complex forwarding when support for
-// SSA scalars live out of 'affine.for'/'affine.if' statements is available.
+// SSA scalars live out of 'for'/'affine.if' statements is available.
 //===----------------------------------------------------------------------===//

 #include "mlir/Analysis/AffineAnalysis.h"
@@ -55,7 +55,7 @@ namespace {
 //
 // (* A dependence being satisfied at a block: a dependence that is satisfied by
 // virtue of the destination instruction appearing textually / lexically after
-// the source instruction within the body of a 'affine.for' instruction; thus, a
+// the source instruction within the body of a 'for' instruction; thus, a
 // dependence is always either satisfied by a loop or by a block).
 //
 // The above conditions are simple to check, sufficient, and powerful for most
@@ -145,8 +145,8 @@ void MemRefDataFlowOpt::forwardStoreToLoad(OpPointer<LoadOp> loadOp) {
     // Check if this store is a candidate for forwarding; we only forward if
     // the dependence from the store is carried by the *body* of innermost
     // common surrounding loop. As an example this filters out cases like:
-    //   affine.for %i0
-    //     affine.for %i1
+    //   for %i0
+    //     for %i1
     //       %idx = affine.apply (d0) -> (d0 + 1) (%i0)
    //       store %A[%idx]
     //       load %A[%i0]
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index 84c8cd830dc..cfa045f2279 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -71,11 +71,11 @@ static unsigned getTagMemRefPos(const Instruction &dmaInst) {
   return 0;
 }

-/// Doubles the buffer of the supplied memref on the specified 'affine.for'
-/// instruction by adding a leading dimension of size two to the memref.
-/// Replaces all uses of the old memref by the new one while indexing the newly
-/// added dimension by the loop IV of the specified 'affine.for' instruction
-/// modulo 2. Returns false if such a replacement cannot be performed.
+/// Doubles the buffer of the supplied memref on the specified 'for' instruction
+/// by adding a leading dimension of size two to the memref. Replaces all uses
+/// of the old memref by the new one while indexing the newly added dimension by
+/// the loop IV of the specified 'for' instruction modulo 2. Returns false if
+/// such a replacement cannot be performed.
 static bool doubleBuffer(Value *oldMemRef, OpPointer<AffineForOp> forOp) {
   auto *forBody = forOp->getBody();
   FuncBuilder bInner(forBody, forBody->begin());
@@ -108,7 +108,7 @@ static bool doubleBuffer(Value *oldMemRef, OpPointer<AffineForOp> forOp) {
                                               dynamicDimCount++));
   }

-  // Create and place the alloc right before the 'affine.for' instruction.
+  // Create and place the alloc right before the 'for' instruction.
   // TODO(mlir-team): we are assuming scoped allocation here, and aren't
   // inserting a dealloc -- this isn't the right thing.
   Value *newMemRef =
@@ -137,9 +137,9 @@ static bool doubleBuffer(Value *oldMemRef, OpPointer<AffineForOp> forOp) {
 /// Returns success if the IR is in a valid state.
 PassResult PipelineDataTransfer::runOnFunction(Function *f) {
   // Do a post order walk so that inner loop DMAs are processed first. This is
-  // necessary since 'affine.for' instructions nested within would otherwise
-  // become invalid (erased) when the outer loop is pipelined (the pipelined one
-  // gets deleted and replaced by a prologue, a new steady-state loop and an
+  // necessary since 'for' instructions nested within would otherwise become
+  // invalid (erased) when the outer loop is pipelined (the pipelined one gets
+  // deleted and replaced by a prologue, a new steady-state loop and an
   // epilogue).
   forOps.clear();
   f->walkPostOrder<AffineForOp>(
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index 110949f43d5..a1903ace026 100644
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -138,8 +138,8 @@ void mlir::promoteSingleIterationLoops(Function *f) {
       [](OpPointer<AffineForOp> forOp) { promoteIfSingleIteration(forOp); });
 }

-/// Generates a 'affine.for' inst with the specified lower and upper bounds
-/// while generating the right IV remappings for the shifted instructions. The
+/// Generates a 'for' inst with the specified lower and upper bounds while
+/// generating the right IV remappings for the shifted instructions. The
 /// instruction blocks that go into the loop are specified in instGroupQueue
 /// starting from the specified offset, and in that order; the first element of
 /// the pair specifies the shift applied to that group of instructions; note
@@ -194,10 +194,10 @@ generateLoop(AffineMap lbMap, AffineMap ubMap,
   return loopChunk;
 }

-/// Skew the instructions in the body of a 'affine.for' instruction with the
-/// specified instruction-wise shifts. The shifts are with respect to the
-/// original execution order, and are multiplied by the loop 'step' before being
-/// applied. A shift of zero for each instruction will lead to no change.
+/// Skew the instructions in the body of a 'for' instruction with the specified
+/// instruction-wise shifts. The shifts are with respect to the original
+/// execution order, and are multiplied by the loop 'step' before being applied.
+/// A shift of zero for each instruction will lead to no change.
 // The skewing of instructions with respect to one another can be used for
 // example to allow overlap of asynchronous operations (such as DMA
 // communication) with computation, or just relative shifting of instructions
@@ -246,7 +246,7 @@ UtilResult mlir::instBodySkew(OpPointer<AffineForOp> forOp,

   // An array of instruction groups sorted by shift amount; each group has all
   // instructions with the same shift in the order in which they appear in the
-  // body of the 'affine.for' inst.
+  // body of the 'for' inst.
   std::vector<std::vector<Instruction *>> sortedInstGroups(maxShift + 1);
   unsigned pos = 0;
   for (auto &inst : *forOp->getBody()) {
diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp
index 90d28bf34df..41689be52fc 100644
--- a/mlir/lib/Transforms/Utils/Utils.cpp
+++ b/mlir/lib/Transforms/Utils/Utils.cpp
@@ -194,14 +194,14 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
 ///
 /// Before
 ///
-/// affine.for %i = 0 to #map(%N)
+/// for %i = 0 to #map(%N)
 ///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
 ///   "send"(%idx, %A, ...)
 ///   "compute"(%idx)
 ///
 /// After
 ///
-/// affine.for %i = 0 to #map(%N)
+/// for %i = 0 to #map(%N)
 ///   %idx = affine.apply (d0) -> (d0 mod 2) (%i)
 ///   "send"(%idx, %A, ...)
 ///   %idx_ = affine.apply (d0) -> (d0 mod 2) (%i)
diff --git a/mlir/lib/Transforms/Vectorize.cpp b/mlir/lib/Transforms/Vectorize.cpp
index 1f4c7b9fcc8..5a8d5d24661 100644
--- a/mlir/lib/Transforms/Vectorize.cpp
+++ b/mlir/lib/Transforms/Vectorize.cpp
@@ -113,7 +113,7 @@ using namespace mlir;
 ///
 /// At a high level, a vectorized load in a loop will resemble:
 /// ```mlir
-///   affine.for %i = ? to ? step ? {
+///   for %i = ? to ? step ? {
 ///     %v_a = "vector_transfer_read" (A, %i) : (memref<?xf32>, index) ->
 ///       vector<128xf32>
 ///   }
@@ -309,7 +309,7 @@ using namespace mlir;
 /// ```mlir
 /// mlfunc @fill(%A : memref<128xf32>) -> () {
 ///   %f1 = constant 1.0 : f32
-///   affine.for %i0 = 0 to 32 {
+///   for %i0 = 0 to 32 {
 ///     store %f1, %A[%i0] : memref<128xf32, 0>
 ///   }
 ///   return
@@ -322,7 +322,7 @@ using namespace mlir;
 /// is still subject to exploratory tradeoffs. In particular, say we want to
 /// vectorize by a factor 128, we want to transform the following input:
 /// ```mlir
-///   affine.for %i = %M to %N {
+///   for %i = %M to %N {
 ///     %a = load A[%i] : memref<?xf32>
 ///   }
 /// ```
@@ -331,8 +331,8 @@ using namespace mlir;
 /// memory promotion etc) say after stripmining (and potentially unrolling in
 /// the case of LLVM's SLP vectorizer):
 /// ```mlir
-///   affine.for %i = floor(%M, 128) to ceil(%N, 128) {
-///     affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) {
+///   for %i = floor(%M, 128) to ceil(%N, 128) {
+///     for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) {
 ///       %a = load A[%ii] : memref<?xf32>
 ///     }
 ///   }
@@ -341,7 +341,7 @@ using namespace mlir;
 /// Instead, we seek to vectorize early and freeze vector types before
 /// scheduling, so we want to generate a pattern that resembles:
 /// ```mlir
-///   affine.for %i = ? to ? step ? {
+///   for %i = ? to ? step ? {
 ///     %v_a = "vector_transfer_read" (A, %i) : (memref<?xf32>, index) ->
 ///       vector<128xf32>
 ///   }
@@ -362,7 +362,7 @@ using namespace mlir;
 /// For the simple strawman example above, vectorizing for a 1-D vector
 /// abstraction of size 128 returns code similar to:
 /// ```mlir
-///   affine.for %i = %M to %N step 128 {
+///   for %i = %M to %N step 128 {
 ///     %v_a = "vector_transfer_read" (A, %i) : (memref<?xf32>, index) ->
 ///       vector<128xf32>
 ///   }
@@ -391,20 +391,20 @@ using namespace mlir;
 ///   %C = alloc (%M, %N) : memref<?x?xf32, 0>
 ///   %f1 = constant 1.0 : f32
 ///   %f2 = constant 2.0 : f32
-///   affine.for %i0 = 0 to %M {
-///     affine.for %i1 = 0 to %N {
+///   for %i0 = 0 to %M {
+///     for %i1 = 0 to %N {
 ///       // non-scoped %f1
 ///       store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
 ///     }
 ///   }
-///   affine.for %i2 = 0 to %M {
-///     affine.for %i3 = 0 to %N {
+///   for %i2 = 0 to %M {
+///     for %i3 = 0 to %N {
 ///       // non-scoped %f2
 ///       store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
 ///     }
 ///   }
-///   affine.for %i4 = 0 to %M {
-///     affine.for %i5 = 0 to %N {
+///   for %i4 = 0 to %M {
+///     for %i5 = 0 to %N {
 ///       %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
 ///       %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>
 ///       %s5 = addf %a5, %b5 : f32
@@ -438,24 +438,24 @@ using namespace mlir;
 ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
 ///   %cst = constant 1.0 : f32
 ///   %cst_0 = constant 2.0 : f32
-///   affine.for %i0 = 0 to %arg0 {
-///     affine.for %i1 = 0 to %arg1 step 256 {
+///   for %i0 = 0 to %arg0 {
+///     for %i1 = 0 to %arg1 step 256 {
 ///       %cst_1 = constant splat<vector<256xf32>, 1.0> :
 ///                vector<256xf32>
 ///       "vector_transfer_write"(%cst_1, %0, %i0, %i1) :
 ///         (vector<256xf32>, memref<?x?xf32>, index, index) -> ()
 ///     }
 ///   }
-///   affine.for %i2 = 0 to %arg0 {
-///     affine.for %i3 = 0 to %arg1 step 256 {
+///   for %i2 = 0 to %arg0 {
+///     for %i3 = 0 to %arg1 step 256 {
 ///       %cst_2 = constant splat<vector<256xf32>, 2.0> :
 ///                vector<256xf32>
 ///       "vector_transfer_write"(%cst_2, %1, %i2, %i3) :
 ///         (vector<256xf32>, memref<?x?xf32>, index, index) -> ()
 ///     }
 ///   }
-///   affine.for %i4 = 0 to %arg0 {
-///     affine.for %i5 = 0 to %arg1 step 256 {
+///   for %i4 = 0 to %arg0 {
+///     for %i5 = 0 to %arg1 step 256 {
 ///       %3 = "vector_transfer_read"(%0, %i4, %i5) :
 ///            (memref<?x?xf32>, index, index) -> vector<256xf32>
 ///       %4 = "vector_transfer_read"(%1, %i4, %i5) :
@@ -494,24 +494,24 @@ using namespace mlir;
 ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
 ///   %cst = constant 1.0 : f32
 ///   %cst_0 = constant 2.0 : f32
-///   affine.for %i0 = 0 to %arg0 step 32 {
-///     affine.for %i1 = 0 to %arg1 step 256 {
+///   for %i0 = 0 to %arg0 step 32 {
+///     for %i1 = 0 to %arg1 step 256 {
 ///       %cst_1 = constant splat<vector<32x256xf32>, 1.0> :
 ///                vector<32x256xf32>
 ///       "vector_transfer_write"(%cst_1, %0, %i0, %i1) :
 ///         (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
 ///     }
 ///   }
-///   affine.for %i2 = 0 to %arg0 step 32 {
-///     affine.for %i3 = 0 to %arg1 step 256 {
+///   for %i2 = 0 to %arg0 step 32 {
+///     for %i3 = 0 to %arg1 step 256 {
 ///       %cst_2 = constant splat<vector<32x256xf32>, 2.0> :
 ///                vector<32x256xf32>
 ///       "vector_transfer_write"(%cst_2, %1, %i2, %i3) :
 ///         (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
 ///     }
 ///   }
-///   affine.for %i4 = 0 to %arg0 step 32 {
-///     affine.for %i5 = 0 to %arg1 step 256 {
+///   for %i4 = 0 to %arg0 step 32 {
+///     for %i5 = 0 to %arg1 step 256 {
 ///       %3 = "vector_transfer_read"(%0, %i4, %i5) :
 ///            (memref<?x?xf32>, index, index) -> vector<32x256xf32>
 ///       %4 = "vector_transfer_read"(%1, %i4, %i5) :
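The doc comment on `doubleBuffer` in the PipelineDataTransfer.cpp hunk above describes the transformation only in prose: give the staging memref a leading dimension of size two and index that dimension with the loop IV modulo 2, so the transfer for the next iteration can be in flight while the current iteration's data is consumed. The following is a minimal standalone C++ sketch of that idea, not the MLIR pass or its API; `startTransfer`, `waitTransfer`, and `compute` are hypothetical stand-ins for the DMA start/wait and the computation.

```cpp
#include <cstddef>
#include <cstring>
#include <vector>

// Hypothetical stand-ins: a real pipeline would issue an asynchronous copy in
// startTransfer() and block on its completion in waitTransfer().
static void startTransfer(float *dst, const float *src, std::size_t n) {
  std::memcpy(dst, src, n * sizeof(float));
}
static void waitTransfer(float * /*dst*/) {}
static float compute(const float *data, std::size_t n) {
  float sum = 0.0f;
  for (std::size_t i = 0; i < n; ++i)
    sum += data[i];
  return sum;
}

// Double buffering: the staging buffer gets a leading dimension of size two,
// and iteration i reads slice (i % 2) while the transfer for iteration i + 1
// targets slice ((i + 1) % 2) -- the analogue of indexing the new memref by
// the loop IV modulo 2.
float pipelinedSum(const float *src, std::size_t numTiles, std::size_t tileSize) {
  std::vector<float> buf(2 * tileSize); // leading dimension of size two
  float total = 0.0f;
  if (numTiles == 0)
    return total;

  startTransfer(&buf[0], src, tileSize); // prologue: fetch tile 0
  for (std::size_t i = 0; i < numTiles; ++i) {
    float *cur = &buf[(i % 2) * tileSize];
    waitTransfer(cur);
    if (i + 1 < numTiles) // steady state: prefetch the next tile
      startTransfer(&buf[((i + 1) % 2) * tileSize], src + (i + 1) * tileSize,
                    tileSize);
    total += compute(cur, tileSize); // overlaps the prefetch in a real pipeline
  }
  return total; // the epilogue is folded into the last iteration here
}
```

The prologue/steady-state/epilogue split mentioned in the PipelineDataTransfer.cpp comment is handled in the pass by shifting instructions (the skewing utility documented in LoopUtils.cpp above); in this sketch it degenerates to the initial `startTransfer` call and the final iteration.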
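Similarly, the Vectorize.cpp comment contrasts vectorizing early at a fixed factor of 128 with the classical route of stripmining and then vectorizing the inner loop. As a point of reference only, here is a small standalone C++ sketch of the stripmined loop shape (the `%i`/`%ii` nest with a `min` upper bound shown in that comment); it illustrates the loop structure, not the pass or its output, and `addOne`/`kVF` are made-up names.

```cpp
#include <algorithm>
#include <cstddef>

// Stripmining by a factor of 128: the outer loop advances in chunks and the
// inner loop covers [i, min(i + 128, n)). A vectorizer would then map the
// inner loop onto a 128-wide vector load/add instead of scalar iterations.
void addOne(float *a, std::size_t n) {
  constexpr std::size_t kVF = 128; // vectorization factor
  for (std::size_t i = 0; i < n; i += kVF) {
    std::size_t end = std::min(i + kVF, n);
    for (std::size_t ii = i; ii < end; ++ii)
      a[ii] += 1.0f;
  }
}
```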

