21 files changed, 597 insertions(+), 908 deletions(-)
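The user-visible syntax changes in this patch: the padding value of `vector.transfer_read` becomes a required trailing operand (previously optional and parenthesized), and `vector.type_cast` now produces a 0-D `memref<vector<...>>` printed with `to`. A minimal sketch of IR accepted after this change (all values `%A`, `%B`, `%i`, `%j` are hypothetical):

```mlir
%f0 = constant 0.0 : f32
%v  = vector.transfer_read %A[%i, %j], %f0
        {permutation_map = (d0, d1) -> (d1)} : memref<?x?xf32>, vector<128xf32>
vector.transfer_write %v, %A[%i, %j]
        {permutation_map = (d0, d1) -> (d1)} : vector<128xf32>, memref<?x?xf32>
%VB = vector.type_cast %B : memref<5x4x3xf32> to memref<vector<5x4x3xf32>>
```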
diff --git a/mlir/g3doc/Dialects/Vector.md b/mlir/g3doc/Dialects/Vector.md
index 4607c32fe74..04f5ba71cdb 100644
--- a/mlir/g3doc/Dialects/Vector.md
+++ b/mlir/g3doc/Dialects/Vector.md
@@ -6,175 +6,9 @@ This dialect provides mid-level abstraction for the MLIR super-vectorizer.
 
 ## Operations
 
-### Vector transfers
+To see the documentation for each operation, run:
 
-#### `vector.transfer_read` operation
-
-Syntax:
-
-``` {.ebnf}
-operation ::= ssa-id `=` `vector.transfer_read` ssa-use-list `{` attribute-entry `} :` function-type
-```
-
-Examples:
-
-```mlir {.mlir}
-// Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into vector<32x256xf32> and
-// pad with %f0 to handle the boundary case:
-%f0 = constant 0.0f : f32
-for %i0 = 0 to %0 {
-  affine.for %i1 = 0 to %1 step 256 {
-    affine.for %i2 = 0 to %2 step 32 {
-      %v = vector.transfer_read %A[%i0, %i1, %i2], (%f0)
-           {permutation_map: (d0, d1, d2) -> (d2, d1)} :
-           memref<?x?x?xf32>, vector<32x256xf32>
-}}}
-
-// Read the slice `%A[%i0, %i1]` (i.e. the element `%A[%i0, %i1]`) into
-// vector<128xf32>. The underlying implementation will require a 1-D vector
-// broadcast:
-for %i0 = 0 to %0 {
-  affine.for %i1 = 0 to %1 {
-    %3 = vector.transfer_read %A[%i0, %i1]
-         {permutation_map: (d0, d1) -> (0)} :
-         memref<?x?xf32>, vector<128xf32>
-  }
-}
-```
-
-The `vector.transfer_read` performs a blocking read from a slice within a scalar
-[MemRef](../LangRef.md#memref-type) supplied as its first operand into a
-[vector](../LangRef.md#vector-type) of the same elemental type. The slice is
-further defined by a full-rank index within the MemRef, supplied as the operands
-`2 .. 1 + rank(memref)`. The permutation_map [attribute](../LangRef.md#attributes)
-is an [affine-map](Affine.md#affine-maps) which specifies the transposition on
-the slice to match the vector shape. The size of the slice is specified by the
-size of the vector, given as the return type. Optionally, an `ssa-value` of the
-same elemental type as the MemRef is provided as the last operand to specify
-padding in the case of out-of-bounds accesses. Absence of the optional padding
-value signifies the `vector.transfer_read` is statically guaranteed to remain
-within the MemRef bounds. This operation is called 'read' by opposition to
-'load' because the super-vector granularity is generally not representable with
-a single hardware register. A `vector.transfer_read` is thus a mid-level
-abstraction that supports super-vectorization with non-effecting padding for
-full-tile-only code.
-
-More precisely, let's dive deeper into the permutation_map for the following:
-
-```mlir {.mlir}
-vector.transfer_read %A[%expr1, %expr2, %expr3, %expr4]
-  { permutation_map : (d0,d1,d2,d3) -> (d2,0,d0) } :
-  memref<?x?x?x?xf32>, vector<3x4x5xf32>
-```
-
-This operation always reads a slice starting at `%A[%expr1, %expr2, %expr3,
-%expr4]`. The size of the slice is 3 along d2 and 5 along d0, so the slice is:
-`%A[%expr1 : %expr1 + 5, %expr2, %expr3 : %expr3 + 3, %expr4]`
-
-That slice needs to be read into a `vector<3x4x5xf32>`. Since the permutation
-map is not full rank, there must be a broadcast along vector dimension `1`.
- -A notional lowering of vector.transfer_read could generate code resembling: - -```mlir {.mlir} -// %expr1, %expr2, %expr3, %expr4 defined before this point -%tmp = alloc() : vector<3x4x5xf32> -%view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>> -for %i = 0 to 3 { - affine.for %j = 0 to 4 { - affine.for %k = 0 to 5 { - %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] : memref<?x?x?x?xf32> - store %tmp[%i, %j, %k] : vector<3x4x5xf32> -}}} -%c0 = constant 0 : index -%vec = load %view_in_tmp[%c0] : vector<3x4x5xf32> -``` - -On a GPU one could then map `i`, `j`, `k` to blocks and threads. Notice that the -temporary storage footprint is `3 * 5` values but `3 * 4 * 5` values are -actually transferred between `%A` and `%tmp`. - -Alternatively, if a notional vector broadcast operation were available, the -lowered code would resemble: - -```mlir {.mlir} -// %expr1, %expr2, %expr3, %expr4 defined before this point -%tmp = alloc() : vector<3x4x5xf32> -%view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>> -for %i = 0 to 3 { - affine.for %k = 0 to 5 { - %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] : memref<?x?x?x?xf32> - store %tmp[%i, 0, %k] : vector<3x4x5xf32> -}} -%c0 = constant 0 : index -%tmpvec = load %view_in_tmp[%c0] : vector<3x4x5xf32> -%vec = broadcast %tmpvec, 1 : vector<3x4x5xf32> +```sh +mlir-tblgen --gen-op-doc -I /path/to/mlir/include \ +/path/to/mlir/include/mlir/Dialect/VectorOps/VectorOps.td ``` - -where `broadcast` broadcasts from element 0 to all others along the specified -dimension. This time, the temporary storage footprint is `3 * 5` values which is -the same amount of data as the `3 * 5` values transferred. An additional `1` -broadcast is required. On a GPU this broadcast could be implemented using a -warp-shuffle if loop `j` were mapped to `threadIdx.x`. - -#### `vector.transfer_write` operation - -Syntax: - -``` {.ebnf} -operation ::= `vector.transfer_write` ssa-use-list `{` attribute-entry `} :` vector-type ', ' memref-type ', ' index-type-list -``` - -Examples: - -```mlir {.mlir} -// write vector<16x32x64xf32> into the slice `%A[%i0, %i1:%i1+32, %i2:%i2+64, %i3:%i3+16]`: -for %i0 = 0 to %0 { - affine.for %i1 = 0 to %1 step 32 { - affine.for %i2 = 0 to %2 step 64 { - affine.for %i3 = 0 to %3 step 16 { - %val = `ssa-value` : vector<16x32x64xf32> - vector.transfer_write %val, %A[%i0, %i1, %i2, %i3] - {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d2)} : - vector<16x32x64xf32>, memref<?x?x?x?xf32> -}}}} -``` - -The `vector.transfer_write` performs a blocking write from a -[vector](../LangRef.md#vector-type), supplied as its first operand, into a slice -within a scalar [MemRef](../LangRef.md#memref-type) of the same elemental type, -supplied as its second operand. The slice is further defined by a full-rank -index within the MemRef, supplied as the operands `3 .. 2 + rank(memref)`. The -permutation_map [attribute](../LangRef.md#attributes) is an -[affine-map](Affine.md#affine-maps) which specifies the transposition on the -slice to match the vector shape. The size of the slice is specified by the size -of the vector. This operation is called 'write' by opposition to 'store' because -the super-vector granularity is generally not representable with a single -hardware register. A `vector.transfer_write` is thus a mid-level abstraction -that supports super-vectorization with non-effecting padding for full-tile-only -code. It is the responsibility of `vector.transfer_write`'s implementation to -ensure the memory writes are valid. 
Different lowerings may be pertinent -depending on the hardware support. - -### Vector views - -#### `vector.type_cast` operation - -Syntax: - -``` {.ebnf} -operation ::= `vector.type_cast` ssa-use : memref-type, memref-type -``` - -Examples: - -```mlir - %A = alloc() : memref<5x4x3xf32> - %VA = vector.type_cast %A : memref<5x4x3xf32>, memref<1xvector<5x4x3xf32>> -``` - -The `vector.type_cast` operation performs a conversion from a memref with scalar -element to memref with a *single* vector element, copying the shape of the -memref to the vector. This is the minimal viable operation that is required to -make super-vectorization operational. It can be seen as a special case of the -`view` operation but scoped in the super-vectorization context. diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorOps.h b/mlir/include/mlir/Dialect/VectorOps/VectorOps.h index 1d29567a1d6..668eaa5c9d5 100644 --- a/mlir/include/mlir/Dialect/VectorOps/VectorOps.h +++ b/mlir/include/mlir/Dialect/VectorOps/VectorOps.h @@ -15,8 +15,7 @@ // limitations under the License. // ============================================================================= // -// This file defines convenience types for working with super-vectorization -// operations, in particular super-vector loads and stores. +// This file defines the Vector dialect. // //===----------------------------------------------------------------------===// @@ -31,178 +30,13 @@ namespace mlir { namespace vector { -/// Dialect for super-vectorization Ops. +/// Dialect for Ops on higher-dimensional vector types. class VectorOpsDialect : public Dialect { public: VectorOpsDialect(MLIRContext *context); static StringRef getDialectNamespace() { return "vector"; } }; -/// VectorTransferReadOp performs a blocking read from a scalar memref -/// location into a super-vector of the same elemental type. This operation is -/// called 'read' by opposition to 'load' because the super-vector granularity -/// is generally not representable with a single hardware register. As a -/// consequence, memory transfers will generally be required when lowering -/// VectorTransferReadOp. A VectorTransferReadOp is thus a mid-level abstraction -/// that supports super-vectorization with non-effecting padding for full-tile -/// only code. -// -/// A vector transfer read has semantics similar to a vector load, with -/// additional support for: -/// 1. an optional value of the elemental type of the MemRef. This value -/// supports non-effecting padding and is inserted in places where the -/// vector read exceeds the MemRef bounds. If the value is not specified, -/// the access is statically guaranteed to be within bounds; -/// 2. an attribute of type AffineMap to specify a slice of the original -/// MemRef access and its transposition into the super-vector shape. -/// The permutation_map is an AffineMap that must represent a permutation -/// from the MemRef dim space projected onto the vector dim space. -/// This permutation_map has as many output dimensions as the vector rank. -/// However, it is not necessarily full rank on the target space to signify -/// that broadcast operations will be needed along certain vector -/// dimensions. -/// In the limit, one may load a 0-D slice of a memref (i.e. a single -/// value) into a vector, which corresponds to broadcasting that value in -/// the whole vector (i.e. a non-constant splat). -/// -/// Example with full rank permutation_map: -/// ```mlir -/// %A = alloc(%size1, %size2, %size3, %size4) : memref<?x?x?x?xf32> -/// ... 
-/// %val = `ssa-value` : f32 -/// // let %i, %j, %k, %l be ssa-values of type index -/// %v0 = vector.transfer_read %src[%i, %j, %k, %l] -/// {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d2)} : -/// memref<?x?x?x?xf32>, vector<16x32x64xf32> -/// %v1 = vector.transfer_read %src[%i, %j, %k, %l], (%val) -/// {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d2)} : -/// memref<?x?x?x?xf32>, vector<16x32x64xf32> -/// ``` -/// -/// Example with partial rank permutation_map: -/// ```mlir -/// %c0 = constant 0 : index -/// %A = alloc(%size1, %size2, %size3, %size4) : memref<?x?x?x?xf32> -/// ... -/// // let %i, %j be ssa-values of type index -/// %v0 = vector.transfer_read %src[%i, %c0, %c0, %c0] -/// {permutation_map: (d0, d1, d2, d3) -> (0, d1, 0)} : -/// memref<?x?x?x?xf32>, vector<16x32x64xf32> -class VectorTransferReadOp - : public Op<VectorTransferReadOp, OpTrait::VariadicOperands, - OpTrait::OneResult> { - enum Offsets : unsigned { MemRefOffset = 0, FirstIndexOffset = 1 }; - -public: - using Op::Op; - - static StringRef getOperationName() { return "vector.transfer_read"; } - static StringRef getPermutationMapAttrName() { return "permutation_map"; } - static void build(Builder *builder, OperationState &result, - VectorType vectorType, Value *srcMemRef, - ArrayRef<Value *> srcIndices, AffineMap permutationMap, - Optional<Value *> paddingValue = None); - VectorType getResultType() { - return getResult()->getType().cast<VectorType>(); - } - Value *getVector() { return getResult(); } - Value *getMemRef() { return getOperand(Offsets::MemRefOffset); } - VectorType getVectorType() { return getResultType(); } - MemRefType getMemRefType() { - return getMemRef()->getType().cast<MemRefType>(); - } - operand_range getIndices(); - Optional<Value *> getPaddingValue(); - AffineMap getPermutationMap(); - - static ParseResult parse(OpAsmParser &parser, OperationState &result); - void print(OpAsmPrinter &p); - LogicalResult verify(); -}; - -/// VectorTransferWriteOp performs a blocking write from a super-vector to -/// a scalar memref of the same elemental type. This operation is -/// called 'write' by opposition to 'store' because the super-vector granularity -/// is generally not representable with a single hardware register. As a -/// consequence, memory transfers will generally be required when lowering -/// VectorTransferWriteOp. A VectorTransferWriteOp is thus a mid-level -/// abstraction that supports super-vectorization with non-effecting padding for -/// full-tile only code. -/// -/// A vector transfer write has semantics similar to a vector store, with -/// additional support for handling out-of-bounds situations. It is the -/// responsibility of vector.transfer_write's implementation to ensure the -/// memory writes are valid. Different implementations may be pertinent -/// depending on the hardware support including: -/// 1. predication; -/// 2. explicit control-flow; -/// 3. Read-Modify-Write; -/// 4. writing out of bounds of the memref when the allocation allows it. -/// -/// Example: -/// ```mlir -/// %A = alloc(%size1, %size2, %size3, %size4) : memref<?x?x?x?xf32>. 
-/// %val = `ssa-value` : vector<16x32x64xf32> -/// // let %i, %j, %k, %l be ssa-values of type index -/// vector.transfer_write %val, %src[%i, %j, %k, %l] -/// {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d2)} : -/// vector<16x32x64xf32>, memref<?x?x?x?xf32> -/// ``` -class VectorTransferWriteOp - : public Op<VectorTransferWriteOp, OpTrait::VariadicOperands, - OpTrait::ZeroResult> { - enum Offsets : unsigned { - VectorOffset = 0, - MemRefOffset = 1, - FirstIndexOffset = 2 - }; - -public: - using Op::Op; - - static StringRef getOperationName() { return "vector.transfer_write"; } - static StringRef getPermutationMapAttrName() { return "permutation_map"; } - static void build(Builder *builder, OperationState &result, Value *srcVector, - Value *dstMemRef, ArrayRef<Value *> dstIndices, - AffineMap permutationMap); - Value *getVector() { return getOperand(Offsets::VectorOffset); } - VectorType getVectorType() { - return getVector()->getType().cast<VectorType>(); - } - Value *getMemRef() { return getOperand(Offsets::MemRefOffset); } - MemRefType getMemRefType() { - return getMemRef()->getType().cast<MemRefType>(); - } - operand_range getIndices(); - AffineMap getPermutationMap(); - - static ParseResult parse(OpAsmParser &parser, OperationState &result); - void print(OpAsmPrinter &p); - LogicalResult verify(); -}; - -/// VectorTypeCastOp performs a conversion from a memref with scalar element to -/// memref with vector element, copying the shape of the memref to the vector. -/// -/// Example: -/// -/// ```mlir -/// %A = alloc() : memref<5x4x3xf32> -/// %VA = vector.type_cast %A : memref<5x4x3xf32>, memref<1xvector<5x4x3xf32>> -/// ``` -class VectorTypeCastOp - : public Op<VectorTypeCastOp, OpTrait::OneOperand, OpTrait::OneResult> { -public: - using Op::Op; - - static StringRef getOperationName() { return "vector.type_cast"; } - static void build(Builder *builder, OperationState &result, Value *srcVector, - Type dstType); - static ParseResult parse(OpAsmParser &parser, OperationState &result); - void print(OpAsmPrinter &p); - LogicalResult verify(); -}; - #define GET_OP_CLASSES #include "mlir/Dialect/VectorOps/VectorOps.h.inc" diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td index 032312e72cf..125ecac57d8 100644 --- a/mlir/include/mlir/Dialect/VectorOps/VectorOps.td +++ b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td @@ -26,6 +26,10 @@ include "mlir/IR/OpBase.td" #endif // OP_BASE +#ifndef AFFINE_OPS_BASE +include "mlir/Dialect/AffineOps/AffineOpsBase.td" +#endif // AFFINE_OPS_BASE + def Vector_Dialect : Dialect { let name = "vector"; let cppNamespace = "vector"; @@ -68,6 +72,7 @@ def ExtractElementOp : } }]; } + def OuterProductOp : Vector_Op<"outerproduct", [NoSideEffect, SameOperandsAndResultElementType]>, Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, Variadic<AnyVector>:$acc)>, @@ -106,4 +111,239 @@ def OuterProductOp : } }]; } + +def VectorTransferReadOp : + Vector_Op<"transfer_read">, + Arguments<(ins AnyMemRef:$memref, Variadic<Index>:$indices, + AffineMapAttr:$permutation_map, AnyType:$padding)>, + Results<(outs AnyVector:$vector)> { + + let summary = "Reads a supervector from memory into an SSA vector value."; + + let description = [{ + The `vector.transfer_read` op performs a blocking read from a slice within + a scalar [MemRef](../LangRef.md#memref-type) supplied as its first operand + into a [vector](../LangRef.md#vector-type) of the same elemental type. 
The
+    slice is further defined by a full-rank index within the MemRef, supplied as
+    the operands `2 .. 1 + rank(memref)`. The permutation_map
+    [attribute](../LangRef.md#attributes) is an
+    [affine-map](Affine.md#affine-maps) which specifies the transposition on the
+    slice to match the vector shape. The size of the slice is specified by the
+    size of the vector, given as the return type. An `ssa-value` of the same
+    elemental type as the MemRef is provided as the last operand to specify
+    padding in the case of out-of-bounds accesses. This operation is called
+    'read' as opposed to 'load' because the super-vector granularity is
+    generally not representable with a single hardware register.
+    A `vector.transfer_read` is thus a mid-level
+    abstraction that supports super-vectorization with non-effecting padding for
+    full-tile-only code.
+
+    More precisely, let's dive deeper into the permutation_map for the following
+    MLIR:
+
+    ```mlir {.mlir}
+    vector.transfer_read %A[%expr1, %expr2, %expr3, %expr4]
+      { permutation_map = (d0,d1,d2,d3) -> (d2,0,d0) } :
+      memref<?x?x?x?xf32>, vector<3x4x5xf32>
+    ```
+
+    This operation always reads a slice starting at `%A[%expr1, %expr2, %expr3,
+    %expr4]`. The size of the slice is 3 along d2 and 5 along d0, so the slice
+    is: `%A[%expr1 : %expr1 + 5, %expr2, %expr3 : %expr3 + 3, %expr4]`
+
+    That slice needs to be read into a `vector<3x4x5xf32>`. Since the
+    permutation map is not full rank, there must be a broadcast along vector
+    dimension `1`.
+
+    A notional lowering of vector.transfer_read could generate code resembling:
+
+    ```mlir
+    // %expr1, %expr2, %expr3, %expr4 defined before this point
+    %tmp = alloc() : vector<3x4x5xf32>
+    %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>>
+    for %i = 0 to 3 {
+      affine.for %j = 0 to 4 {
+        affine.for %k = 0 to 5 {
+          %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] :
+            memref<?x?x?x?xf32>
+          store %a, %tmp[%i, %j, %k] : vector<3x4x5xf32>
+    }}}
+    %c0 = constant 0 : index
+    %vec = load %view_in_tmp[%c0] : vector<3x4x5xf32>
+    ```
+
+    On a GPU one could then map `i`, `j`, `k` to blocks and threads. Notice that
+    the temporary storage footprint is `3 * 5` values but `3 * 4 * 5` values are
+    actually transferred between `%A` and `%tmp`.
+
+    Alternatively, if a notional vector broadcast operation were available, the
+    lowered code would resemble:
+
+    ```mlir
+    // %expr1, %expr2, %expr3, %expr4 defined before this point
+    %tmp = alloc() : vector<3x4x5xf32>
+    %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>>
+    for %i = 0 to 3 {
+      affine.for %k = 0 to 5 {
+        %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] :
+          memref<?x?x?x?xf32>
+        store %a, %tmp[%i, 0, %k] : vector<3x4x5xf32>
+    }}
+    %c0 = constant 0 : index
+    %tmpvec = load %view_in_tmp[%c0] : vector<3x4x5xf32>
+    %vec = broadcast %tmpvec, 1 : vector<3x4x5xf32>
+    ```
+
+    where `broadcast` broadcasts from element 0 to all others along the
+    specified dimension. This time, the temporary storage footprint is `3 * 5`
+    values, which is the same amount of data as the `3 * 5` values transferred.
+    One additional broadcast is required. On a GPU this broadcast could be
+    implemented using a warp-shuffle if loop `j` were mapped to `threadIdx.x`.
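+
+    The padding operand supplies the value observed by any element of the
+    slice that falls outside the MemRef bounds. A hedged, per-element sketch
+    of that semantics for a 1-D read (the comparison and clamping below are
+    illustrative only, not the actual lowering):
+
+    ```mlir
+    // Element %l of a vector<128xf32> read at base %i of %A : memref<?xf32>,
+    // with padding value %f0:
+    %c0 = constant 0 : index
+    %dim = dim %A, 0 : memref<?xf32>
+    %idx = affine.apply (d0, d1) -> (d0 + d1) (%i, %l)
+    %inb = cmpi "slt", %idx, %dim : index
+    %safe = select %inb, %idx, %c0 : index  // clamp so the load stays in bounds
+    %ld  = load %A[%safe] : memref<?xf32>
+    %elt = select %inb, %ld, %f0 : f32      // out-of-bounds elements get %f0
+    ```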
+
+    Syntax:
+
+    ``` {.ebnf}
+    operation ::= ssa-id `=` `vector.transfer_read` ssa-use-list
+                    `{` attribute-entry `} :` memref-type `,` vector-type
+    ```
+
+    Examples:
+
+    ```mlir
+    // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into vector<32x256xf32>
+    // and pad with %f0 to handle the boundary case:
+    %f0 = constant 0.0f : f32
+    for %i0 = 0 to %0 {
+      affine.for %i1 = 0 to %1 step 256 {
+        affine.for %i2 = 0 to %2 step 32 {
+          %v = vector.transfer_read %A[%i0, %i1, %i2], %f0
+            {permutation_map = (d0, d1, d2) -> (d2, d1)} :
+            memref<?x?x?xf32>, vector<32x256xf32>
+    }}}
+
+    // Read the slice `%A[%i0, %i1]` (i.e. the element `%A[%i0, %i1]`) into
+    // vector<128xf32>. The underlying implementation will require a 1-D vector
+    // broadcast:
+    for %i0 = 0 to %0 {
+      affine.for %i1 = 0 to %1 {
+        %3 = vector.transfer_read %A[%i0, %i1], %f0
+          {permutation_map = (d0, d1) -> (0)} :
+          memref<?x?xf32>, vector<128xf32>
+      }
+    }
+    ```
+  }];
+
+  let extraClassDeclaration = [{
+    MemRefType getMemRefType() {
+      return memref()->getType().cast<MemRefType>();
+    }
+    VectorType getVectorType() {
+      return vector()->getType().cast<VectorType>();
+    }
+  }];
+}
+
+def VectorTransferWriteOp :
+  Vector_Op<"transfer_write">,
+  Arguments<(ins AnyVector:$vector, AnyMemRef:$memref,
+             Variadic<Index>:$indices,
+             AffineMapAttr:$permutation_map)> {
+
+  let summary = "The vector.transfer_write op writes a supervector to memory.";
+
+  let description = [{
+    The `vector.transfer_write` op performs a blocking write from a
+    [vector](../LangRef.md#vector-type), supplied as its first operand, into a
+    slice within a scalar [MemRef](../LangRef.md#memref-type) of the same
+    elemental type, supplied as its second operand. The slice is further defined
+    by a full-rank index within the MemRef, supplied as the operands
+    `3 .. 2 + rank(memref)`.
+    The permutation_map [attribute](../LangRef.md#attributes) is an
+    [affine-map](Affine.md#affine-maps) which specifies the transposition on the
+    slice to match the vector shape. The size of the slice is specified by the
+    size of the vector. This operation is called 'write' as opposed to 'store'
+    because the super-vector granularity is generally not representable with a
+    single hardware register. A `vector.transfer_write` is thus a mid-level
+    abstraction that supports super-vectorization with non-effecting padding
+    for full-tile-only code. It is the responsibility of
+    `vector.transfer_write`'s implementation to ensure the memory writes are
+    valid. Different lowerings may be pertinent depending on the hardware
+    support: predication, explicit control flow, read-modify-write, or writing
+    out of bounds of the memref when the allocation allows it.
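+
+    A notional lowering mirroring the alloc-store-copy-dealloc scheme used by
+    LowerVectorTransfers.cpp in this patch could resemble the following sketch
+    (clipping of the destination indices is elided, and `%i + %ii` is notional
+    index arithmetic):
+
+    ```mlir
+    // %val : vector<3x5xf32>, %A : memref<?x?xf32>, %i, %j : index
+    %tmp = alloc() : memref<3x5xf32>
+    %view = vector.type_cast %tmp : memref<3x5xf32> to memref<vector<3x5xf32>>
+    store %val, %view[] : memref<vector<3x5xf32>>
+    for %ii = 0 to 3 {
+      affine.for %jj = 0 to 5 {
+        %a = load %tmp[%ii, %jj] : memref<3x5xf32>
+        store %a, %A[%i + %ii, %j + %jj] : memref<?x?xf32>
+    }}
+    dealloc %tmp : memref<3x5xf32>
+    ```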
+
+    Syntax:
+
+    ``` {.ebnf}
+    operation ::= `vector.transfer_write` ssa-use-list `{` attribute-entry `} :`
+                    vector-type `,` memref-type
+    ```
+
+    Examples:
+
+    ```mlir
+    // Write vector<16x32x64xf32> into the slice
+    //   `%A[%i0, %i1:%i1+32, %i2:%i2+64, %i3:%i3+16]`:
+    for %i0 = 0 to %0 {
+      affine.for %i1 = 0 to %1 step 32 {
+        affine.for %i2 = 0 to %2 step 64 {
+          affine.for %i3 = 0 to %3 step 16 {
+            %val = `ssa-value` : vector<16x32x64xf32>
+            vector.transfer_write %val, %A[%i0, %i1, %i2, %i3]
+              {permutation_map = (d0, d1, d2, d3) -> (d3, d1, d2)} :
+              vector<16x32x64xf32>, memref<?x?x?x?xf32>
+    }}}}
+    ```
+  }];
+
+  let extraClassDeclaration = [{
+    VectorType getVectorType() {
+      return vector()->getType().cast<VectorType>();
+    }
+    MemRefType getMemRefType() {
+      return memref()->getType().cast<MemRefType>();
+    }
+  }];
+}
+
+def VectorTypeCastOp :
+  Vector_Op<"type_cast", [NoSideEffect]>,
+  Arguments<(ins StaticShapeMemRefOf<[AnyType]>:$memref)>,
+  Results<(outs AnyMemRef)> {
+  let summary = "type_cast op converts a scalar memref to a vector memref";
+  let description = [{
+    Performs a conversion from a memref with a scalar element type to a memref
+    with a *single* vector element, copying the shape of the memref to the
+    vector. This is the minimal viable operation that is required to make
+    super-vectorization operational. It can be seen as a special case of the
+    `view` operation but scoped in the super-vectorization context.
+
+    Syntax:
+
+    ``` {.ebnf}
+    operation ::= `vector.type_cast` ssa-use : memref-type to memref-type
+    ```
+
+    Example:
+
+    ```mlir
+    %A  = alloc() : memref<5x4x3xf32>
+    %VA = vector.type_cast %A : memref<5x4x3xf32> to memref<vector<5x4x3xf32>>
+    ```
+  }];
+
+  let builders = [OpBuilder<
+    "Builder *builder, OperationState &result, Value *source">];
+
+  let parser = [{
+    return impl::parseCastOp(parser, result);
+  }];
+
+  let extraClassDeclaration = [{
+    MemRefType getMemRefType() {
+      return memref()->getType().cast<MemRefType>();
+    }
+    MemRefType getResultMemRefType() {
+      return getResult()->getType().cast<MemRefType>();
+    }
+  }];
+}
 #endif // VECTOR_OPS
diff --git a/mlir/lib/Analysis/VectorAnalysis.cpp b/mlir/lib/Analysis/VectorAnalysis.cpp
index e765ce35e74..2dab3481e56 100644
--- a/mlir/lib/Analysis/VectorAnalysis.cpp
+++ b/mlir/lib/Analysis/VectorAnalysis.cpp
@@ -195,7 +195,7 @@ bool mlir::matcher::operatesOnSuperVectorsOf(Operation &op,
   (void)mustDivide;
   VectorType superVectorType;
   if (auto read = dyn_cast<vector::VectorTransferReadOp>(op)) {
-    superVectorType = read.getResultType();
+    superVectorType = read.getVectorType();
     mustDivide = true;
   } else if (auto write = dyn_cast<vector::VectorTransferWriteOp>(op)) {
     superVectorType = write.getVectorType();
diff --git a/mlir/lib/Conversion/VectorToLLVM/VectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/VectorToLLVM.cpp
index 5ccf740f2fb..21bcdc9a6db 100644
--- a/mlir/lib/Conversion/VectorToLLVM/VectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/VectorToLLVM.cpp
@@ -196,10 +196,10 @@ public:
     int64_t offset;
     SmallVector<int64_t, 4> strides;
     auto successStrides =
-        getStridesAndOffset(targetMemRefType, strides, offset);
+        getStridesAndOffset(sourceMemRefType, strides, offset);
     bool isContiguous = (strides.back() == 1);
     if (isContiguous) {
-      auto sizes = targetMemRefType.getShape();
+      auto sizes = sourceMemRefType.getShape();
       for (int index = 0, e = strides.size() - 2; index < e; ++index) {
         if (strides[index] != strides[index + 1] * sizes[index + 1]) {
           isContiguous = false;
           break;
         }
       }
     }
-    //
Only contiguous tensors supported atm. + // Only contiguous source tensors supported atm. if (failed(successStrides) || !isContiguous) return matchFailure(); diff --git a/mlir/lib/Dialect/VectorOps/VectorOps.cpp b/mlir/lib/Dialect/VectorOps/VectorOps.cpp index 8626f241955..215e92d0947 100644 --- a/mlir/lib/Dialect/VectorOps/VectorOps.cpp +++ b/mlir/lib/Dialect/VectorOps/VectorOps.cpp @@ -37,8 +37,6 @@ using namespace mlir::vector; mlir::vector::VectorOpsDialect::VectorOpsDialect(MLIRContext *context) : Dialect(getDialectNamespace(), context) { - addOperations<VectorTransferReadOp, VectorTransferWriteOp, - VectorTypeCastOp>(); addOperations< #define GET_OP_LIST #include "mlir/Dialect/VectorOps/VectorOps.cpp.inc" @@ -195,354 +193,165 @@ static LogicalResult verifyPermutationMap(AffineMap permutationMap, return success(); } -void VectorTransferReadOp::build(Builder *builder, OperationState &result, - VectorType vectorType, Value *srcMemRef, - ArrayRef<Value *> srcIndices, - AffineMap permutationMap, - Optional<Value *> paddingValue) { - result.addOperands(srcMemRef); - result.addOperands(srcIndices); - if (paddingValue) { - result.addOperands({*paddingValue}); - } - result.addAttribute(getPermutationMapAttrName(), - AffineMapAttr::get(permutationMap)); - result.addTypes(vectorType); -} - -auto VectorTransferReadOp::getIndices() -> operand_range { - auto begin = getOperation()->operand_begin() + Offsets::FirstIndexOffset; - auto end = begin + getMemRefType().getRank(); - return {begin, end}; -} - -Optional<Value *> VectorTransferReadOp::getPaddingValue() { - auto memRefRank = getMemRefType().getRank(); - if (getNumOperands() <= Offsets::FirstIndexOffset + memRefRank) { - return None; - } - return Optional<Value *>(getOperand(Offsets::FirstIndexOffset + memRefRank)); -} - -AffineMap VectorTransferReadOp::getPermutationMap() { - return getAttrOfType<AffineMapAttr>(getPermutationMapAttrName()).getValue(); -} - -void VectorTransferReadOp::print(OpAsmPrinter &p) { - p << getOperationName() << " "; - p.printOperand(getMemRef()); +static void print(OpAsmPrinter &p, VectorTransferReadOp op) { + p << op.getOperationName() << " "; + p.printOperand(op.memref()); p << "["; - p.printOperands(getIndices()); - p << "]"; - auto optionalPaddingValue = getPaddingValue(); - if (optionalPaddingValue) { - p << ", ("; - p.printOperand(*optionalPaddingValue); - p << ")"; - } - p.printOptionalAttrDict(getAttrs()); - p << " : " << getMemRefType(); - p << ", " << getResultType(); + p.printOperands(op.indices()); + p << "], "; + p.printOperand(op.padding()); + p << " "; + p.printOptionalAttrDict(op.getAttrs()); + p << " : " << op.getMemRefType(); + p << ", " << op.getVectorType(); } -ParseResult VectorTransferReadOp::parse(OpAsmParser &parser, - OperationState &result) { +ParseResult parseVectorTransferReadOp(OpAsmParser &parser, + OperationState &result) { + llvm::SMLoc typesLoc; OpAsmParser::OperandType memrefInfo; SmallVector<OpAsmParser::OperandType, 8> indexInfo; - SmallVector<OpAsmParser::OperandType, 8> paddingInfo; + OpAsmParser::OperandType paddingInfo; SmallVector<Type, 2> types; - // Parsing with support for optional paddingValue. 
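+  // Parses: %memref `[` %indices `]` `,` %padding attr-dict
+  //         `:` memref-type `,` vector-type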
if (parser.parseOperand(memrefInfo) || parser.parseOperandList(indexInfo, OpAsmParser::Delimiter::Square) || - parser.parseTrailingOperandList(paddingInfo, - OpAsmParser::Delimiter::Paren) || + parser.parseComma() || parser.parseOperand(paddingInfo) || parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonTypeList(types)) + parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types)) return failure(); - - // Resolution. if (types.size() != 2) - return parser.emitError(parser.getNameLoc(), "expected 2 types"); - MemRefType memrefType = types[0].dyn_cast<MemRefType>(); - if (!memrefType) - return parser.emitError(parser.getNameLoc(), "memRef type expected"); - VectorType vectorType = types[1].dyn_cast<VectorType>(); - if (!vectorType) - return parser.emitError(parser.getNameLoc(), "vector type expected"); - - // Extract optional paddingValue. - // At this point, indexInfo may contain the optional paddingValue, pop it - // out. - if (static_cast<int64_t>(indexInfo.size()) != memrefType.getRank()) - return parser.emitError(parser.getNameLoc(), - "expected " + Twine(memrefType.getRank()) + - " indices to the memref"); - if (paddingInfo.size() > 1) - return parser.emitError(parser.getNameLoc(), - "expected at most one padding value"); - Type paddingType; - bool hasOptionalPaddingValue = !paddingInfo.empty(); - if (hasOptionalPaddingValue) { - paddingType = vectorType.getElementType(); - } + return parser.emitError(typesLoc, "two types required"); auto indexType = parser.getBuilder().getIndexType(); + MemRefType memRefType = types[0].dyn_cast<MemRefType>(); + if (!memRefType) + return parser.emitError(typesLoc, "memref type required"), failure(); + Type vectorType = types[1]; return failure( - parser.resolveOperand(memrefInfo, memrefType, result.operands) || + parser.resolveOperand(memrefInfo, memRefType, result.operands) || parser.resolveOperands(indexInfo, indexType, result.operands) || - (hasOptionalPaddingValue && - parser.resolveOperand(paddingInfo[0], paddingType, result.operands)) || + parser.resolveOperand(paddingInfo, memRefType.getElementType(), + result.operands) || parser.addTypeToList(vectorType, result.types)); } -LogicalResult VectorTransferReadOp::verify() { - // Consistency of memref type in function type. - if (llvm::empty(getOperands())) { - return emitOpError( - "requires at least a memref operand followed by 'rank' indices"); - } - if (!getMemRef()->getType().isa<MemRefType>()) { - return emitOpError("requires a memref as first operand"); - } - // Consistency of vector type in function type. - if (!getResult()->getType().isa<VectorType>()) { - return emitOpError("should have a vector result type in function type: " - "memref_type<...xelemental_type>, vector_type"); - } +static LogicalResult verify(VectorTransferReadOp op) { // Consistency of elemental types in memref and vector. - MemRefType memrefType = getMemRefType(); - VectorType vectorType = getResultType(); + MemRefType memrefType = op.getMemRefType(); + VectorType vectorType = op.getVectorType(); if (memrefType.getElementType() != vectorType.getElementType()) - return emitOpError( + return op.emitOpError( "requires memref and vector types of the same elemental type"); - // Consistency of number of input types. - auto optionalPaddingValue = getPaddingValue(); - unsigned expectedNumOperands = Offsets::FirstIndexOffset + - memrefType.getRank() + - (optionalPaddingValue ? 1 : 0); - // Checks on the actual operands and their types. 
- if (getNumOperands() != expectedNumOperands) { - return emitOpError("expects ") - << expectedNumOperands << " operands (of which " - << memrefType.getRank() << " indices)"; - } - // Consistency of padding value with vector type. - if (optionalPaddingValue) { - auto paddingValue = *optionalPaddingValue; - auto elementalType = paddingValue->getType(); - if (!VectorType::isValidElementType(elementalType)) { - return emitOpError("requires valid padding vector elemental type"); - } - if (elementalType != vectorType.getElementType()) { - return emitOpError( - "requires formal padding and vector of the same elemental type"); - } - } - // Consistency of indices types. - unsigned numIndices = 0; - for (auto *idx : getIndices()) { - if (!idx->getType().isIndex()) { - return emitOpError( - "index to vector.transfer_read must have 'index' type"); - } - ++numIndices; - } - if (numIndices != memrefType.getRank()) { - return emitOpError("requires at least a memref operand followed by ") - << memrefType.getRank() << " indices"; - } - - // Consistency of AffineMap attribute. - if (!getAttrOfType<AffineMapAttr>(getPermutationMapAttrName())) { - return emitOpError("requires an AffineMapAttr named 'permutation_map'"); - } - auto permutationMap = getPermutationMap(); - if (permutationMap.getNumSymbols() != 0) { - return emitOpError("requires a permutation_map without symbols"); - } - if (permutationMap.getNumInputs() != memrefType.getRank()) { - return emitOpError("requires a permutation_map with input dims of the " - "same rank as the memref type"); - } - if (permutationMap.getNumResults() != vectorType.getRank()) { - return emitOpError("requires a permutation_map with result dims of the " - "same rank as the vector type (") - << permutationMap.getNumResults() << " vs " << vectorType.getRank(); - } + auto elementalType = op.padding()->getType(); + if (!VectorType::isValidElementType(elementalType)) + return op.emitOpError("requires valid padding vector elemental type"); + if (elementalType != vectorType.getElementType()) + return op.emitOpError( + "requires formal padding and vector of the same elemental type"); + if (llvm::size(op.indices()) != memrefType.getRank()) + return op.emitOpError("requires ") << memrefType.getRank() << " indices"; + auto permutationMap = op.permutation_map(); + if (permutationMap.getNumSymbols() != 0) + return op.emitOpError("requires permutation_map without symbols"); + if (permutationMap.getNumInputs() != memrefType.getRank()) + return op.emitOpError("requires a permutation_map with input dims of the " + "same rank as the memref type"); + if (permutationMap.getNumResults() != vectorType.getRank()) + return op.emitOpError("requires a permutation_map with result dims of the " + "same rank as the vector type"); return verifyPermutationMap(permutationMap, - [this](Twine t) { return emitOpError(t); }); + [&op](Twine t) { return op.emitOpError(t); }); } //===----------------------------------------------------------------------===// // VectorTransferWriteOp //===----------------------------------------------------------------------===// -void VectorTransferWriteOp::build(Builder *builder, OperationState &result, - Value *srcVector, Value *dstMemRef, - ArrayRef<Value *> dstIndices, - AffineMap permutationMap) { - result.addOperands({srcVector, dstMemRef}); - result.addOperands(dstIndices); - result.addAttribute(getPermutationMapAttrName(), - AffineMapAttr::get(permutationMap)); -} - -auto VectorTransferWriteOp::getIndices() -> operand_range { - auto begin = 
getOperation()->operand_begin() + Offsets::FirstIndexOffset; - auto end = begin + getMemRefType().getRank(); - return {begin, end}; -} - -AffineMap VectorTransferWriteOp::getPermutationMap() { - return getAttrOfType<AffineMapAttr>(getPermutationMapAttrName()).getValue(); -} - -void VectorTransferWriteOp::print(OpAsmPrinter &p) { - p << getOperationName(); - p << " " << *getVector(); - p << ", " << *getMemRef(); +static void print(OpAsmPrinter &p, VectorTransferWriteOp op) { + p << op.getOperationName() << " " << *op.vector() << ", " << *op.memref(); p << "["; - p.printOperands(getIndices()); + p.printOperands(op.indices()); p << "]"; - p.printOptionalAttrDict(getAttrs()); + p.printOptionalAttrDict(op.getAttrs()); p << " : "; - p.printType(getVectorType()); + p.printType(op.getVectorType()); p << ", "; - p.printType(getMemRefType()); + p.printType(op.getMemRefType()); } -ParseResult VectorTransferWriteOp::parse(OpAsmParser &parser, - OperationState &result) { +ParseResult parseVectorTransferWriteOp(OpAsmParser &parser, + OperationState &result) { + llvm::SMLoc typesLoc; OpAsmParser::OperandType storeValueInfo; - OpAsmParser::OperandType memrefInfo; + OpAsmParser::OperandType memRefInfo; SmallVector<OpAsmParser::OperandType, 4> indexInfo; SmallVector<Type, 2> types; - auto indexType = parser.getBuilder().getIndexType(); if (parser.parseOperand(storeValueInfo) || parser.parseComma() || - parser.parseOperand(memrefInfo) || + parser.parseOperand(memRefInfo) || parser.parseOperandList(indexInfo, OpAsmParser::Delimiter::Square) || parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonTypeList(types)) + parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types)) return failure(); - if (types.size() != 2) - return parser.emitError(parser.getNameLoc(), "expected 2 types"); - VectorType vectorType = types[Offsets::VectorOffset].dyn_cast<VectorType>(); - if (!vectorType) - return parser.emitError(parser.getNameLoc(), "vector type expected"); - MemRefType memrefType = types[Offsets::MemRefOffset].dyn_cast<MemRefType>(); - if (!memrefType) - return parser.emitError(parser.getNameLoc(), "memRef type expected"); - + return parser.emitError(typesLoc, "two types required"); + auto indexType = parser.getBuilder().getIndexType(); + Type vectorType = types[0], memRefType = types[1]; return failure( - parser.resolveOperands(storeValueInfo, vectorType, result.operands) || - parser.resolveOperands(memrefInfo, memrefType, result.operands) || + parser.resolveOperand(storeValueInfo, vectorType, result.operands) || + parser.resolveOperand(memRefInfo, memRefType, result.operands) || parser.resolveOperands(indexInfo, indexType, result.operands)); } -LogicalResult VectorTransferWriteOp::verify() { - // Consistency of memref type in function type. - if (llvm::empty(getOperands())) { - return emitOpError( - "requires at least a memref operand followed by 'rank' indices"); - } - if (!getMemRef()->getType().isa<MemRefType>()) { - return emitOpError("requires a memref first operand"); - } - // Consistency of vector type in function type. - if (!getVector()->getType().isa<VectorType>()) { - return emitOpError("should have a vector input type in function type: " - "(vector_type, memref_type [, elemental_type]) -> ()"); - } +static LogicalResult verify(VectorTransferWriteOp op) { // Consistency of elemental types in memref and vector. 
- MemRefType memrefType = getMemRefType(); - VectorType vectorType = getVectorType(); + MemRefType memrefType = op.getMemRefType(); + VectorType vectorType = op.getVectorType(); if (memrefType.getElementType() != vectorType.getElementType()) - return emitOpError( + return op.emitOpError( "requires memref and vector types of the same elemental type"); - // Consistency of number of input types. - unsigned expectedNumOperands = - Offsets::FirstIndexOffset + memrefType.getRank(); - // Checks on the actual operands and their types. - if (getNumOperands() != expectedNumOperands) { - return emitOpError() << "expects " << expectedNumOperands - << " operands (of which " << memrefType.getRank() - << " indices)"; - } - // Consistency of indices types. - unsigned numIndices = 0; - for (auto *idx : getIndices()) { - if (!idx->getType().isIndex()) { - return emitOpError( - "index to vector.transfer_write must have 'index' type"); - } - numIndices++; - } - if (numIndices != memrefType.getRank()) { - return emitOpError("requires at least a memref operand followed by ") - << memrefType.getRank() << " indices"; - } + if (llvm::size(op.indices()) != memrefType.getRank()) + return op.emitOpError("requires ") << memrefType.getRank() << " indices"; // Consistency of AffineMap attribute. - if (!getAttrOfType<AffineMapAttr>(getPermutationMapAttrName())) { - return emitOpError("requires an AffineMapAttr named 'permutation_map'"); - } - auto permutationMap = getPermutationMap(); - if (permutationMap.getNumSymbols() != 0) { - return emitOpError("requires a permutation_map without symbols"); - } - if (permutationMap.getNumInputs() != memrefType.getRank()) { - return emitOpError("requires a permutation_map with input dims of the " - "same rank as the memref type"); - } - if (permutationMap.getNumResults() != vectorType.getRank()) { - return emitOpError("requires a permutation_map with result dims of the " - "same rank as the vector type (") - << permutationMap.getNumResults() << " vs " << vectorType.getRank(); - } + auto permutationMap = op.permutation_map(); + if (permutationMap.getNumSymbols() != 0) + return op.emitOpError("requires a symbol-less permutation_map"); + if (permutationMap.getNumInputs() != memrefType.getRank()) + return op.emitOpError("requires a permutation_map with input dims of the " + "same rank as the memref type: ") + << permutationMap.getNumInputs() << " vs " << memrefType; + if (permutationMap.getNumResults() != vectorType.getRank()) + return op.emitOpError("requires a permutation_map with result dims of the " + "same rank as the vector type.") + << permutationMap.getNumResults() << " vs " << vectorType; return verifyPermutationMap(permutationMap, - [this](Twine t) { return emitOpError(t); }); + [&op](Twine t) { return op.emitOpError(t); }); } //===----------------------------------------------------------------------===// // VectorTypeCastOp //===----------------------------------------------------------------------===// -void VectorTypeCastOp::build(Builder *builder, OperationState &result, - Value *srcVector, Type dstType) { - result.addOperands(srcVector); - result.addTypes(dstType); -} -ParseResult VectorTypeCastOp::parse(OpAsmParser &parser, - OperationState &result) { - OpAsmParser::OperandType operand; - Type srcType, dstType; - return failure(parser.parseOperand(operand) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(srcType) || parser.parseComma() || - parser.parseType(dstType) || - parser.addTypeToList(dstType, result.types) || - 
parser.resolveOperand(operand, srcType, result.operands)); +static MemRefType inferVectorTypeCastResultType(MemRefType t) { + return MemRefType::get({}, VectorType::get(t.getShape(), t.getElementType())); } -void VectorTypeCastOp::print(OpAsmPrinter &p) { - p << getOperationName() << ' ' << *getOperand() << " : " - << getOperand()->getType() << ", " << getType(); +void VectorTypeCastOp::build(Builder *builder, OperationState &result, + Value *source) { + result.addOperands(source); + result.addTypes( + inferVectorTypeCastResultType(source->getType().cast<MemRefType>())); } -LogicalResult VectorTypeCastOp::verify() { - auto dstMemrefType = getType().dyn_cast<MemRefType>(); - if (!dstMemrefType) - return emitOpError("expects target type to be a memref type"); - auto dstVectorType = dstMemrefType.getElementType().dyn_cast<VectorType>(); - if (!dstVectorType) - return emitOpError( - "expects vector as an element of the target memref type"); - if (!dstMemrefType.hasStaticShape()) - return emitOpError("does not support dynamic shapes"); - - if (!getOperand()->getType().isa<MemRefType>()) - return emitOpError("expects source type to be a memref type"); +static void print(OpAsmPrinter &p, VectorTypeCastOp &op) { + auto type = op.getOperand()->getType().cast<MemRefType>(); + p << op.getOperationName() << ' ' << *op.memref() << " : " << type << " to " + << inferVectorTypeCastResultType(type); +} +static LogicalResult verify(VectorTypeCastOp &op) { + auto resultType = inferVectorTypeCastResultType(op.getMemRefType()); + if (op.getResultMemRefType() != resultType) + return op.emitOpError("expects result type to be: ") << resultType; return success(); } diff --git a/mlir/lib/Transforms/LowerVectorTransfers.cpp b/mlir/lib/Transforms/LowerVectorTransfers.cpp index c517d74f221..57dd18dac0f 100644 --- a/mlir/lib/Transforms/LowerVectorTransfers.cpp +++ b/mlir/lib/Transforms/LowerVectorTransfers.cpp @@ -113,12 +113,6 @@ struct VectorTransferRewriter : public RewritePattern { {}, 0); } - /// View of tmpMemRefType as one vector, used in vector load/store to tmp - /// buffer. - MemRefType vectorMemRefType(VectorTransferOpTy transfer) const { - return MemRefType::get({1}, transfer.getVectorType(), {}, 0); - } - /// Performs the rewrite. PatternMatchResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override; @@ -139,7 +133,7 @@ void coalesceCopy(VectorTransferOpTy transfer, // the loop order for creating pointwise copies between remote and local // memories. int coalescedIdx = -1; - auto exprs = transfer.getPermutationMap().getResults(); + auto exprs = transfer.permutation_map().getResults(); for (auto en : llvm::enumerate(exprs)) { auto dim = en.value().template dyn_cast<AffineDimExpr>(); if (!dim) { @@ -170,7 +164,7 @@ llvm::SmallVector<edsc::ValueHandle, 8> clip(VectorTransferOpTy transfer, using edsc::intrinsics::select; IndexHandle zero(index_t(0)), one(index_t(1)); - llvm::SmallVector<edsc::ValueHandle, 8> memRefAccess(transfer.getIndices()); + llvm::SmallVector<edsc::ValueHandle, 8> memRefAccess(transfer.indices()); llvm::SmallVector<edsc::ValueHandle, 8> clippedScalarAccessExprs( memRefAccess.size(), edsc::IndexHandle()); @@ -180,7 +174,7 @@ llvm::SmallVector<edsc::ValueHandle, 8> clip(VectorTransferOpTy transfer, ++memRefDim) { // Linear search on a small number of entries. 
int loopIndex = -1; - auto exprs = transfer.getPermutationMap().getResults(); + auto exprs = transfer.permutation_map().getResults(); for (auto en : llvm::enumerate(exprs)) { auto expr = en.value(); auto dim = expr.template dyn_cast<AffineDimExpr>(); @@ -273,9 +267,9 @@ VectorTransferRewriter<VectorTransferReadOp>::matchAndRewrite( // 1. Setup all the captures. ScopedContext scope(rewriter, transfer.getLoc()); - IndexedValue remote(transfer.getMemRef()); - MemRefView view(transfer.getMemRef()); - VectorView vectorView(transfer.getVector()); + IndexedValue remote(transfer.memref()); + MemRefView view(transfer.memref()); + VectorView vectorView(transfer.vector()); SmallVector<IndexHandle, 8> ivs = makeIndexHandles(vectorView.rank()); SmallVector<ValueHandle *, 8> pivs = makeIndexHandlePointers(MutableArrayRef<IndexHandle>(ivs)); @@ -291,12 +285,12 @@ VectorTransferRewriter<VectorTransferReadOp>::matchAndRewrite( // 2. Emit alloc-copy-load-dealloc. ValueHandle tmp = alloc(tmpMemRefType(transfer)); IndexedValue local(tmp); - ValueHandle vec = vector_type_cast(tmp, vectorMemRefType(transfer)); + ValueHandle vec = vector_type_cast(tmp); LoopNestBuilder(pivs, lbs, ubs, steps)([&] { // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist). local(ivs) = remote(clip(transfer, view, ivs)); }); - ValueHandle vectorValue = std_load(vec, {constant_index(0)}); + ValueHandle vectorValue = std_load(vec); (dealloc(tmp)); // vexing parse // 3. Propagate. @@ -336,10 +330,10 @@ VectorTransferRewriter<VectorTransferWriteOp>::matchAndRewrite( // 1. Setup all the captures. ScopedContext scope(rewriter, transfer.getLoc()); - IndexedValue remote(transfer.getMemRef()); - MemRefView view(transfer.getMemRef()); - ValueHandle vectorValue(transfer.getVector()); - VectorView vectorView(transfer.getVector()); + IndexedValue remote(transfer.memref()); + MemRefView view(transfer.memref()); + ValueHandle vectorValue(transfer.vector()); + VectorView vectorView(transfer.vector()); SmallVector<IndexHandle, 8> ivs = makeIndexHandles(vectorView.rank()); SmallVector<ValueHandle *, 8> pivs = makeIndexHandlePointers(ivs); coalesceCopy(transfer, &pivs, &vectorView); @@ -354,8 +348,8 @@ VectorTransferRewriter<VectorTransferWriteOp>::matchAndRewrite( // 2. Emit alloc-store-copy-dealloc. ValueHandle tmp = alloc(tmpMemRefType(transfer)); IndexedValue local(tmp); - ValueHandle vec = vector_type_cast(tmp, vectorMemRefType(transfer)); - std_store(vectorValue, vec, {constant_index(0)}); + ValueHandle vec = vector_type_cast(tmp); + std_store(vectorValue, vec); LoopNestBuilder(pivs, lbs, ubs, steps)([&] { // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist). 
remote(clip(transfer, view, ivs)) = local(ivs); diff --git a/mlir/lib/Transforms/MaterializeVectors.cpp b/mlir/lib/Transforms/MaterializeVectors.cpp index a0b60dd3648..06016da5caa 100644 --- a/mlir/lib/Transforms/MaterializeVectors.cpp +++ b/mlir/lib/Transforms/MaterializeVectors.cpp @@ -465,7 +465,7 @@ static AffineMap projectedPermutationMap(VectorTransferOpTy transfer, ++dim; }, superVectorType.getShape(), *optionalRatio); - auto permutationMap = transfer.getPermutationMap(); + auto permutationMap = transfer.permutation_map(); LLVM_DEBUG(permutationMap.print(dbgs() << "\npermutationMap: ")); if (keep.empty()) { return permutationMap; @@ -486,16 +486,16 @@ static Operation *instantiate(OpBuilder b, VectorTransferReadOp read, ArrayRef<unsigned> hwVectorInstance, DenseMap<Value *, Value *> *substitutionsMap) { SmallVector<Value *, 8> indices = - map(makePtrDynCaster<Value>(), read.getIndices()); + map(makePtrDynCaster<Value>(), read.indices()); auto affineIndices = reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices); auto map = projectedPermutationMap(read, hwVectorType); if (!map) { return nullptr; } - auto cloned = b.create<VectorTransferReadOp>(read.getLoc(), hwVectorType, - read.getMemRef(), affineIndices, - map, read.getPaddingValue()); + auto cloned = b.create<VectorTransferReadOp>( + read.getLoc(), hwVectorType, read.memref(), affineIndices, + AffineMapAttr::get(map), read.padding()); return cloned.getOperation(); } @@ -510,14 +510,14 @@ static Operation *instantiate(OpBuilder b, VectorTransferWriteOp write, ArrayRef<unsigned> hwVectorInstance, DenseMap<Value *, Value *> *substitutionsMap) { SmallVector<Value *, 8> indices = - map(makePtrDynCaster<Value>(), write.getIndices()); + map(makePtrDynCaster<Value>(), write.indices()); auto affineIndices = reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices); auto cloned = b.create<VectorTransferWriteOp>( write.getLoc(), - substitute(write.getVector(), hwVectorType, substitutionsMap), - write.getMemRef(), affineIndices, - projectedPermutationMap(write, hwVectorType)); + substitute(write.vector(), hwVectorType, substitutionsMap), + write.memref(), affineIndices, + AffineMapAttr::get(projectedPermutationMap(write, hwVectorType))); return cloned.getOperation(); } diff --git a/mlir/lib/Transforms/Vectorize.cpp b/mlir/lib/Transforms/Vectorize.cpp index a1e87568745..b3eea35a55f 100644 --- a/mlir/lib/Transforms/Vectorize.cpp +++ b/mlir/lib/Transforms/Vectorize.cpp @@ -35,6 +35,7 @@ #include "mlir/Pass/Pass.h" #include "mlir/Support/Functional.h" #include "mlir/Support/LLVM.h" +#include "mlir/Transforms/FoldUtils.h" #include "mlir/Transforms/Passes.h" #include "llvm/ADT/DenseMap.h" @@ -718,6 +719,8 @@ struct VectorizationState { // Checks that the type of `op` is AffineStoreOp and adds it to the terminals // set. void registerTerminal(Operation *op); + // Folder used to factor out constant creation. 
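+  // Not owned: set by vectorizeRootMatch to a folder whose lifetime covers
+  // the vectorization of one root match.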
+  OperationFolder *folder;

 private:
   void registerReplacement(Value *key, Value *value);
@@ -832,7 +835,11 @@ static LogicalResult vectorizeRootOrTerminal(Value *iv,
     LLVM_DEBUG(permutationMap.print(dbgs()));
     auto transfer = b.create<vector::VectorTransferReadOp>(
         opInst->getLoc(), vectorType, memoryOp.getMemRef(),
-        map(makePtrDynCaster<Value>(), indices), permutationMap);
+        map(makePtrDynCaster<Value>(), indices),
+        AffineMapAttr::get(permutationMap),
+        // TODO(b/144455320) add a proper padding value, not just 0.0 : f32
+        state->folder->create<ConstantFloatOp>(
+            b, opInst->getLoc(), llvm::APFloat(0.0f), b.getF32Type()));
     state->registerReplacement(opInst, transfer.getOperation());
   } else {
     state->registerTerminal(opInst);
@@ -1058,7 +1065,8 @@ static Operation *vectorizeOneOperation(Operation *opInst,
     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
     LLVM_DEBUG(permutationMap.print(dbgs()));
     auto transfer = b.create<vector::VectorTransferWriteOp>(
-        opInst->getLoc(), vectorValue, memRef, indices, permutationMap);
+        opInst->getLoc(), vectorValue, memRef, indices,
+        AffineMapAttr::get(permutationMap));
     auto *res = transfer.getOperation();
     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res);
     // "Terminals" (i.e. AffineStoreOps) are erased on the spot.
@@ -1152,8 +1160,10 @@ static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
 static LogicalResult vectorizeRootMatch(NestedMatch m,
                                         VectorizationStrategy *strategy) {
   auto loop = cast<AffineForOp>(m.getMatchedOperation());
+  OperationFolder folder(loop.getContext());
   VectorizationState state;
   state.strategy = strategy;
+  state.folder = &folder;

   // Since patterns are recursive, they can very well intersect.
   // Since we do not want a fully greedy strategy in general, we decouple
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index d4bbc051a8d..ff07f52cf23 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -48,22 +48,18 @@ func @extract_element_from_vec_3d(%arg0: vector<4x3x16xf32>) -> f32 {
 // CHECK: llvm.extractelement %{{.*}}, %{{.*}} : !llvm<"<16 x float>">
 // CHECK: llvm.return %{{.*}} : !llvm.float

-func @vector_type_cast(%arg0: memref<8x8x8xf32>) -> memref<1xvector<8x8x8xf32>> {
-  %0 = vector.type_cast %arg0: memref<8x8x8xf32>, memref<1xvector<8x8x8xf32>>
-  return %0 : memref<1xvector<8x8x8xf32>>
+func @vector_type_cast(%arg0: memref<8x8x8xf32>) -> memref<vector<8x8x8xf32>> {
+  %0 = vector.type_cast %arg0: memref<8x8x8xf32> to memref<vector<8x8x8xf32>>
+  return %0 : memref<vector<8x8x8xf32>>
 }
 // CHECK-LABEL: vector_type_cast
-// CHECK: llvm.mlir.undef : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64, [1 x i64], [1 x i64] }">
+// CHECK: llvm.mlir.undef : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64 }">
 // CHECK: %[[allocated:.*]] = llvm.extractvalue {{.*}}[0 : index] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
 // CHECK: %[[allocatedBit:.*]] = llvm.bitcast %[[allocated]] : !llvm<"float*"> to !llvm<"[8 x [8 x <8 x float>]]*">
-// CHECK: llvm.insertvalue %[[allocatedBit]], {{.*}}[0 : index] : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64, [1 x i64], [1 x i64] }">
+// CHECK: llvm.insertvalue %[[allocatedBit]], {{.*}}[0 : index] : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64 }">
 // CHECK: %[[aligned:.*]] = llvm.extractvalue {{.*}}[1 : index] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
 // CHECK: %[[alignedBit:.*]] = llvm.bitcast %[[aligned]] : !llvm<"float*"> to !llvm<"[8 x [8 x <8 x float>]]*">
-// CHECK: llvm.insertvalue %[[alignedBit]], {{.*}}[1 : index] : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64, [1 x i64], [1 x i64] }">
+// CHECK: llvm.insertvalue %[[alignedBit]], {{.*}}[1 : index] : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64 }">
 // CHECK: llvm.mlir.constant(0 : index
-// CHECK: llvm.insertvalue {{.*}}[2 : index] : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64, [1 x i64], [1 x i64] }">
-// CHECK: llvm.mlir.constant(1 : index
-// CHECK: llvm.insertvalue {{.*}}[3, 0] : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64, [1 x i64], [1 x i64] }">
-// CHECK: llvm.mlir.constant(1 : index
-// CHECK: llvm.insertvalue {{.*}}[4, 0] : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64, [1 x i64], [1 x i64] }">
+// CHECK: llvm.insertvalue {{.*}}[2 : index] : !llvm<"{ [8 x [8 x <8 x float>]]*, [8 x [8 x <8 x float>]]*, i64 }">
diff --git a/mlir/test/Dialect/VectorOps/invalid.mlir b/mlir/test/Dialect/VectorOps/invalid.mlir
index ca339e7362a..2db4cf53384 100644
--- a/mlir/test/Dialect/VectorOps/invalid.mlir
+++ b/mlir/test/Dialect/VectorOps/invalid.mlir
@@ -96,3 +96,146 @@ func @outerproduct_operand_3_result_type_generic(%arg0: vector<4xf32>, %arg1: ve
   // expected-error@+1 {{expected operand #3 of same type as result type}}
   %1 = "vector.outerproduct" (%arg0, %arg1, %arg2) : (vector<4xf32>, vector<8xf32>, vector<4x16xf32>) -> (vector<4x8xf32>)
 }
+
+// -----
+
+func @test_vector.transfer_read(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant 3.0 : f32
+  // expected-error@+1 {{two types required}}
+  %0 = vector.transfer_read %arg0[%c3, %c3], %cst { permutation_map = ()->(0) } : memref<?x?xf32>
+}
+
+// -----
+
+func @test_vector.transfer_read(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant 3.0 : f32
+  // expected-error@+1 {{requires 2 indices}}
+  %0 = vector.transfer_read %arg0[%c3, %c3, %c3], %cst { permutation_map = ()->(0) } : memref<?x?xf32>, vector<128xf32>
+}
+
+// -----
+
+func @test_vector.transfer_read(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant 3.0 : f32
+  // expected-error@+1 {{requires attribute 'permutation_map'}}
+  %0 = vector.transfer_read %arg0[%c3, %c3], %cst {perm = (d0)->(d0)} : memref<?x?xf32>, vector<128xf32>
+}
+
+// -----
+
+func @test_vector.transfer_read(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant 3.0 : f32
+  // expected-error@+1 {{requires a permutation_map with input dims of the same rank as the memref type}}
+  %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = (d0)->(d0)} : memref<?x?xf32>, vector<128xf32>
+}
+
+// -----
+
+func @test_vector.transfer_read(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant 3.0 : f32
+  // expected-error@+1 {{requires a permutation_map with result dims of the same rank as the vector type}}
+  %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = (d0, d1)->(d0, d1)} : memref<?x?xf32>, vector<128xf32>
+}
+
+// -----
+
+func @test_vector.transfer_read(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant 3.0 : f32
+  // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}}
+  %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = (d0, d1)->(d0 + d1)} : memref<?x?xf32>, vector<128xf32>
+}
+
+// -----
+
+func @test_vector.transfer_read(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant 3.0 : f32
+  // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}}
+  %0 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = (d0, d1)->(d0 + 1)} : memref<?x?xf32>, vector<128xf32>
+}
+
+// -----
+
+func @test_vector.transfer_read(%arg0: memref<?x?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant 3.0 : f32
+  // expected-error@+1 {{requires a permutation_map that is a permutation (found one dim used more than once)}}
+  %0 = vector.transfer_read %arg0[%c3, %c3, %c3], %cst {permutation_map = (d0, d1, d2)->(d0, d0)} : memref<?x?x?xf32>, vector<3x7xf32>
+}
+
+// -----
+
+func @test_vector.transfer_write(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant dense<3.0> : vector<128 x f32>
+  // expected-error@+1 {{expected 5 operand types but had 4}}
+  %0 = "vector.transfer_write"(%cst, %arg0, %c3, %c3, %c3) {permutation_map = ()->(0)} : (vector<128xf32>, memref<?x?xf32>, index, index) -> ()
+}
+
+// -----
+
+func @test_vector.transfer_write(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant dense<3.0> : vector<128 x f32>
+  // expected-error@+1 {{requires 2 indices}}
+  vector.transfer_write %cst, %arg0[%c3, %c3, %c3] {permutation_map = ()->(0)} : vector<128xf32>, memref<?x?xf32>
+}
+
+// -----
+
+func @test_vector.transfer_write(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant dense<3.0> : vector<128 x f32>
+  // expected-error@+1 {{requires attribute 'permutation_map'}}
+  vector.transfer_write %cst, %arg0[%c3, %c3] {perm = (d0)->(d0)} : vector<128xf32>, memref<?x?xf32>
+}
+
+// -----
+
+func @test_vector.transfer_write(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant dense<3.0> : vector<128 x f32>
+  // expected-error@+1 {{requires a permutation_map with input dims of the same rank as the memref type}}
+  vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = (d0)->(d0)} : vector<128xf32>, memref<?x?xf32>
+}
+
+// -----
+
+func @test_vector.transfer_write(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant dense<3.0> : vector<128 x f32>
+  // expected-error@+1 {{requires a permutation_map with result dims of the same rank as the vector type}}
+  vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0, d1)} : vector<128xf32>, memref<?x?xf32>
+}
+
+// -----
+
+func @test_vector.transfer_write(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant dense<3.0> : vector<128 x f32>
+  // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}}
+  vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0 + d1)} : vector<128xf32>, memref<?x?xf32>
+}
+
+// -----
+
+func @test_vector.transfer_write(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant dense<3.0> : vector<128 x f32>
+  // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}}
+  vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0 + 1)} : vector<128xf32>, memref<?x?xf32>
+}
+// -----
+
+func @test_vector.transfer_write(%arg0: memref<?x?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant dense<3.0> : vector<3 x 7 x f32>
+  // expected-error@+1 {{requires a permutation_map that is a permutation (found one dim used more than once)}}
+  vector.transfer_write %cst, %arg0[%c3, %c3, %c3] {permutation_map = (d0, d1, d2)->(d0, d0)} : vector<3x7xf32>, memref<?x?x?xf32>
+}
diff --git a/mlir/test/Dialect/VectorOps/ops.mlir b/mlir/test/Dialect/VectorOps/ops.mlir
index 067345af0d9..77d40f5e84d 100644
--- a/mlir/test/Dialect/VectorOps/ops.mlir
+++ b/mlir/test/Dialect/VectorOps/ops.mlir
@@ -1,5 +1,27 @@
 // RUN: mlir-opt %s | mlir-opt | FileCheck %s

+// CHECK-LABEL: func @vector_transfer_ops(
+func @vector_transfer_ops(%arg0: memref<?x?xf32>) {
+  %c3 = constant 3 : index
+  %cst = constant 3.0 : f32
+  %f0 = constant 0.0 : f32
+  //
+  // CHECK: %0 = vector.transfer_read
+  %0 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = (d0, d1)->(d0)} : memref<?x?xf32>, vector<128xf32>
+  // CHECK: %1 = vector.transfer_read
+  %1 = vector.transfer_read %arg0[%c3, %c3], %f0 {permutation_map = (d0, d1)->(d1, d0)} : memref<?x?xf32>, vector<3x7xf32>
+  // CHECK: vector.transfer_read
+  %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = (d0, d1)->(d0)} : memref<?x?xf32>, vector<128xf32>
+  // CHECK: vector.transfer_read
+  %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = (d0, d1)->(d1)} : memref<?x?xf32>, vector<128xf32>
+  //
+  // CHECK: vector.transfer_write
+  vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0)} : vector<128xf32>, memref<?x?xf32>
+  // CHECK: vector.transfer_write
+  vector.transfer_write %1, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d1, d0)} : vector<3x7xf32>, memref<?x?xf32>
+  return
+}
+
 // CHECK-LABEL: extractelement
 func @extractelement(%arg0: vector<4x8x16xf32>) -> (vector<8x16xf32>, vector<16xf32>, f32) {
 // CHECK: vector.extractelement {{.*}}[3 : i32] : vector<4x8x16xf32>
diff --git a/mlir/test/IR/core-ops.mlir b/mlir/test/IR/core-ops.mlir
index 96df40202a3..417c8723aa4 100644
--- a/mlir/test/IR/core-ops.mlir
+++ b/mlir/test/IR/core-ops.mlir
@@ -7,9 +7,6 @@
 // CHECK: #map0 = (d0) -> (d0 + 1)
 // CHECK: #map1 = ()[s0] -> (s0 + 1)

-// CHECK-DAG: #[[map_proj_d0d1_d0:map[0-9]+]] = (d0, d1) -> (d0)
-// CHECK-DAG: #[[map_proj_d0d1_d1:map[0-9]+]] = (d0, d1) -> (d1)
-// CHECK-DAG: #[[map_proj_d0d1_d1d0:map[0-9]+]] = (d0, d1) -> (d1, d0)
 // CHECK-DAG: #[[VIEW_MAP1:map[0-9]+]] = (d0, d1) -> (d0 * 4 + d1)
 // CHECK-DAG: #[[VIEW_MAP2:map[0-9]+]] = (d0, d1)[s0, s1] -> (d0 * s1 + d1 + s0)

@@ -564,26 +561,6 @@ func @test_splat_op(%s : f32) {
   return
 }

-// CHECK-LABEL: func @test_vector.transfer_ops(%arg0
-func @test_vector.transfer_ops(%arg0: memref<?x?xf32>) {
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // CHECK: %0 = vector.transfer_read %arg0[%c3, %c3] {permutation_map = #[[map_proj_d0d1_d0]]} : memref<?x?xf32>, vector<128xf32>
-  %0 = vector.transfer_read %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0)} : memref<?x?xf32>, vector<128xf32>
-  // CHECK: %1 = vector.transfer_read %arg0[%c3, %c3] {permutation_map = #[[map_proj_d0d1_d1d0]]} : memref<?x?xf32>, vector<3x7xf32>
-  %1 = vector.transfer_read %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d1, d0)} : memref<?x?xf32>, vector<3x7xf32>
-  // CHECK: %2 = vector.transfer_read %arg0[%c3, %c3], (%cst) {permutation_map = #[[map_proj_d0d1_d0]]} : memref<?x?xf32>, vector<128xf32>
-  %2 = vector.transfer_read %arg0[%c3, %c3], (%cst) {permutation_map = (d0, d1)->(d0)} : memref<?x?xf32>, vector<128xf32>
-  // CHECK: %3 = vector.transfer_read %arg0[%c3, %c3], (%cst) {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
-  %3 = vector.transfer_read %arg0[%c3, %c3], (%cst) {permutation_map = (d0, d1)->(d1)} : memref<?x?xf32>, vector<128xf32>
-  //
-  // CHECK: vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = #[[map_proj_d0d1_d0]]} : vector<128xf32>, memref<?x?xf32>
-  vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0)} : vector<128xf32>, memref<?x?xf32>
-  // CHECK: vector.transfer_write %1, %arg0[%c3, %c3] {permutation_map = #[[map_proj_d0d1_d1d0]]} : vector<3x7xf32>, memref<?x?xf32>
-  vector.transfer_write %1, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d1, d0)} : vector<3x7xf32>, memref<?x?xf32>
-  return
-}
-
 // CHECK-LABEL: func @tensor_load_store
 func @tensor_load_store(%0 : memref<4x4xi32>) {
   // CHECK: %[[TENSOR:.*]] = tensor_load %[[MEMREF:.*]] : memref<4x4xi32>
diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir
index 9c1807807c3..74dd4129422 100644
--- a/mlir/test/IR/invalid-ops.mlir
+++ b/mlir/test/IR/invalid-ops.mlir
@@ -297,185 +297,6 @@ func @func_with_ops(i1, tensor<42xi32>, tensor<?xi32>) {

 // -----

-func @test_vector.transfer_read(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // expected-error@+1 {{expected 2 types}}
-  %0 = vector.transfer_read %arg0[%c3, %c3] : memref<?x?xf32>
-}
-
-// -----
-
-func @test_vector.transfer_read(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // expected-error@+1 {{expected 2 indices to the memref}}
-  %0 = vector.transfer_read %arg0[%c3, %c3, %c3] : memref<?x?xf32>, vector<128xf32>
-}
-
-// -----
-
-func @test_vector.transfer_read(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // expected-error@+1 {{requires an AffineMapAttr named 'permutation_map'}}
-  %0 = vector.transfer_read %arg0[%c3, %c3] : memref<?x?xf32>, vector<128xf32>
-}
-
-// -----
-
-func @test_vector.transfer_read(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // expected-error@+1 {{requires an AffineMapAttr named 'permutation_map'}}
-  %0 = vector.transfer_read %arg0[%c3, %c3] {perm = (d0)->(d0)} : memref<?x?xf32>, vector<128xf32>
-}
-
-// -----
-
-func @test_vector.transfer_read(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // expected-error@+1 {{requires a permutation_map with input dims of the same rank as the memref type}}
-  %0 = vector.transfer_read %arg0[%c3, %c3] {permutation_map = (d0)->(d0)} : memref<?x?xf32>, vector<128xf32>
-}
-
-// -----
-
-func @test_vector.transfer_read(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // expected-error@+1 {{requires a permutation_map with result dims of the same rank as the vector type}}
-  %0 = vector.transfer_read %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0, d1)} : memref<?x?xf32>, vector<128xf32>
-}
-
-// -----
-
-func @test_vector.transfer_read(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}}
-  %0 = vector.transfer_read %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0 + d1)} : memref<?x?xf32>, vector<128xf32>
-}
-
-// -----
-
-func @test_vector.transfer_read(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}}
-  %0 = vector.transfer_read %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0 + 1)} : memref<?x?xf32>, vector<128xf32>
-}
-
-// -----
-
-func @test_vector.transfer_read(memref<?x?x?xf32>) {
-^bb0(%arg0: memref<?x?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant 3.0 : f32
-  // expected-error@+1 {{requires a permutation_map that is a permutation (found one dim used more than once)}}
-  %0 = vector.transfer_read %arg0[%c3, %c3, %c3] {permutation_map = (d0, d1, d2)->(d0, d0)} : memref<?x?x?xf32>, vector<3x7xf32>
-}
-
-// -----
-
-func @test_vector.transfer_write(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant dense<3.0> : vector<128 x f32>
-  // expected-error@+1 {{expected 5 operand types but had 4}}
-  %0 = "vector.transfer_write"(%cst, %arg0, %c3, %c3, %c3) : (vector<128xf32>, memref<?x?xf32>, index, index) -> ()
-}
-
-// -----
-
-func @test_vector.transfer_write(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant dense<3.0> : vector<128 x f32>
-  // expected-error@+1 {{expects 4 operands (of which 2 indices)}}
-  vector.transfer_write %cst, %arg0[%c3, %c3, %c3] : vector<128xf32>, memref<?x?xf32>
-}
-
-// -----
-
-func @test_vector.transfer_write(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant dense<3.0> : vector<128 x f32>
-  // expected-error@+1 {{requires an AffineMapAttr named 'permutation_map'}}
-  vector.transfer_write %cst, %arg0[%c3, %c3] : vector<128xf32>, memref<?x?xf32>
-}
-
-// -----
-
-func @test_vector.transfer_write(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant dense<3.0> : vector<128 x f32>
-  // expected-error@+1 {{requires an AffineMapAttr named 'permutation_map'}}
-  vector.transfer_write %cst, %arg0[%c3, %c3] {perm = (d0)->(d0)} : vector<128xf32>, memref<?x?xf32>
-}
-
-// -----
-
-func @test_vector.transfer_write(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant dense<3.0> : vector<128 x f32>
-  // expected-error@+1 {{requires a permutation_map with input dims of the same rank as the memref type}}
-  vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = (d0)->(d0)} : vector<128xf32>, memref<?x?xf32>
-}
-
-// -----
-
-func @test_vector.transfer_write(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant dense<3.0> : vector<128 x f32>
-  // expected-error@+1 {{requires a permutation_map with result dims of the same rank as the vector type}}
-  vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0, d1)} : vector<128xf32>, memref<?x?xf32>
-}
-
-// -----
-
-func @test_vector.transfer_write(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant dense<3.0> : vector<128 x f32>
-  // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}}
-  vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0 + d1)} : vector<128xf32>, memref<?x?xf32>
-}
-
-// -----
-
-func @test_vector.transfer_write(memref<?x?xf32>) {
-^bb0(%arg0: memref<?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant dense<3.0> : vector<128 x f32>
-  // expected-error@+1 {{requires a projected permutation_map (at most one dim or the zero constant can appear in each result)}}
-  vector.transfer_write %cst, %arg0[%c3, %c3] {permutation_map = (d0, d1)->(d0 + 1)} : vector<128xf32>, memref<?x?xf32>
-}
-// -----
-
-func @test_vector.transfer_write(memref<?x?x?xf32>) {
-^bb0(%arg0: memref<?x?x?xf32>):
-  %c3 = constant 3 : index
-  %cst = constant dense<3.0> : vector<3 x 7 x f32>
-  // expected-error@+1 {{requires a permutation_map that is a permutation (found one dim used more than once)}}
-  vector.transfer_write %cst, %arg0[%c3, %c3, %c3] {permutation_map = (d0, d1, d2)->(d0, d0)} : vector<3x7xf32>, memref<?x?x?xf32>
-}
-
-// -----
-
 func @invalid_select_shape(%cond : i1, %idx : () -> ()) {
   // expected-error@+1 {{expected type with valid i1 shape}}
   %sel = select %cond, %idx, %idx : () -> ()
diff --git a/mlir/test/Transforms/Vectorize/lower_vector_transfers.mlir b/mlir/test/Transforms/Vectorize/lower_vector_transfers.mlir
index 31f8bf60b07..f9ca0d0da86 100644
--- a/mlir/test/Transforms/Vectorize/lower_vector_transfers.mlir
+++ b/mlir/test/Transforms/Vectorize/lower_vector_transfers.mlir
@@ -5,16 +5,17 @@
 // CHECK-LABEL: func @materialize_read_1d() {
 func @materialize_read_1d() {
+  %f0 = constant 0.0: f32
   %A = alloc () : memref<7x42xf32>
   affine.for %i0 = 0 to 7 step 4 {
     affine.for %i1 = 0 to 42 step 4 {
-      %f1 = vector.transfer_read %A[%i0, %i1] {permutation_map = (d0, d1) -> (d0)} : memref<7x42xf32>, vector<4xf32>
+      %f1 = vector.transfer_read %A[%i0, %i1], %f0 {permutation_map = (d0, d1) -> (d0)} : memref<7x42xf32>, vector<4xf32>
       %ip1 = affine.apply (d0) -> (d0 + 1) (%i1)
-      %f2 = vector.transfer_read %A[%i0, %ip1] {permutation_map = (d0, d1) -> (d0)} : memref<7x42xf32>, vector<4xf32>
+      %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {permutation_map = (d0, d1) -> (d0)} : memref<7x42xf32>, vector<4xf32>
       %ip2 = affine.apply (d0) -> (d0 + 2) (%i1)
-      %f3 = vector.transfer_read %A[%i0, %ip2] {permutation_map = (d0, d1) -> (d0)} : memref<7x42xf32>, vector<4xf32>
+      %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {permutation_map = (d0, d1) -> (d0)} : memref<7x42xf32>, vector<4xf32>
       %ip3 = affine.apply (d0) -> (d0 + 3) (%i1)
-      %f4 = vector.transfer_read %A[%i0, %ip3] {permutation_map = (d0, d1) -> (d0)} : memref<7x42xf32>, vector<4xf32>
+      %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = (d0, d1) -> (d0)} : memref<7x42xf32>, vector<4xf32>

       // Both accesses in the load must be clipped otherwise %i1 + 2 and %i1 + 3 will go out of bounds.
       // CHECK: {{.*}} = select
       // CHECK: %[[FILTERED1:.*]] = select
@@ -28,15 +29,16 @@ func @materialize_read_1d() {

 // CHECK-LABEL: func @materialize_read_1d_partially_specialized
 func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %dyn4 : index) {
+  %f0 = constant 0.0: f32
   %A = alloc (%dyn1, %dyn2, %dyn4) : memref<7x?x?x42x?xf32>
   affine.for %i0 = 0 to 7 {
     affine.for %i1 = 0 to %dyn1 {
       affine.for %i2 = 0 to %dyn2 {
         affine.for %i3 = 0 to 42 step 2 {
           affine.for %i4 = 0 to %dyn4 {
-            %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4] {permutation_map = (d0, d1, d2, d3, d4) -> (d3)} : memref<7x?x?x42x?xf32>, vector<4xf32>
+            %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {permutation_map = (d0, d1, d2, d3, d4) -> (d3)} : memref<7x?x?x42x?xf32>, vector<4xf32>
             %i3p1 = affine.apply (d0) -> (d0 + 1) (%i3)
-            %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4] {permutation_map = (d0, d1, d2, d3, d4) -> (d3)} : memref<7x?x?x42x?xf32>, vector<4xf32>
+            %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {permutation_map = (d0, d1, d2, d3, d4) -> (d3)} : memref<7x?x?x42x?xf32>, vector<4xf32>
           }
         }
       }
@@ -53,6 +55,7 @@ func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %d

 // CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
 func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
+  %f0 = constant 0.0: f32
   // CHECK-DAG: %[[C0:.*]] = constant 0 : index
   // CHECK-DAG: %[[C1:.*]] = constant 1 : index
   // CHECK-DAG: %[[C3:.*]] = constant 3 : index
@@ -68,7 +71,7 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
   // CHECK-NEXT: %[[D2:.*]] = dim %{{.*}}, 2 : memref<?x?x?x?xf32>
   // CHECK-NEXT: %[[D3:.*]] = dim %{{.*}}, 3 : memref<?x?x?x?xf32>
   // CHECK: %[[ALLOC:.*]] = alloc() : memref<5x4x3xf32>
-  // CHECK-NEXT: %[[VECTOR_VIEW:.*]] = vector.type_cast %[[ALLOC]] : memref<5x4x3xf32>, memref<1xvector<5x4x3xf32>>
+  // CHECK-NEXT: %[[VECTOR_VIEW:.*]] = vector.type_cast %[[ALLOC]] : memref<5x4x3xf32>
   // CHECK-NEXT: loop.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
   // CHECK-NEXT: loop.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
   // CHECK-NEXT: loop.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
@@ -103,7 +106,7 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
   // CHECK-NEXT: }
   // CHECK-NEXT: }
   // CHECK-NEXT: }
-  // CHECK: {{.*}} = load %[[VECTOR_VIEW]][{{.*}}] : memref<1xvector<5x4x3xf32>>
+  // CHECK: {{.*}} = load %[[VECTOR_VIEW]][] : memref<vector<5x4x3xf32>>
   // CHECK-NEXT: dealloc %[[ALLOC]] : memref<5x4x3xf32>
   // CHECK-NEXT: }
   // CHECK-NEXT: }
@@ -120,7 +123,7 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
   affine.for %i1 = 0 to %N {
     affine.for %i2 = 0 to %O {
       affine.for %i3 = 0 to %P step 5 {
-        %f = vector.transfer_read %A[%i0, %i1, %i2, %i3] {permutation_map = (d0, d1, d2, d3) -> (d3, 0, d0)} : memref<?x?x?x?xf32>, vector<5x4x3xf32>
+        %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = (d0, d1, d2, d3) -> (d3, 0, d0)} : memref<?x?x?x?xf32>, vector<5x4x3xf32>
       }
     }
   }
@@ -146,8 +149,8 @@ func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
   // CHECK-NEXT: %[[D2:.*]] = dim %{{.*}}, 2 : memref<?x?x?x?xf32>
   // CHECK-NEXT: %[[D3:.*]] = dim %{{.*}}, 3 : memref<?x?x?x?xf32>
   // CHECK: %[[ALLOC:.*]] = alloc() : memref<5x4x3xf32>
-  // CHECK-NEXT: %[[VECTOR_VIEW:.*]] = vector.type_cast {{.*}} : memref<5x4x3xf32>, memref<1xvector<5x4x3xf32>>
-  // CHECK: store %{{.*}}, {{.*}} : memref<1xvector<5x4x3xf32>>
+  // CHECK-NEXT: %[[VECTOR_VIEW:.*]] = vector.type_cast {{.*}} : memref<5x4x3xf32>
+  // CHECK: store %{{.*}}, {{.*}} : memref<vector<5x4x3xf32>>
   // CHECK-NEXT: loop.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
   // CHECK-NEXT: loop.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
   // CHECK-NEXT: loop.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
diff --git a/mlir/test/Transforms/Vectorize/vectorize_1d.mlir b/mlir/test/Transforms/Vectorize/vectorize_1d.mlir
index afab2303c08..83f783c3aef 100644
--- a/mlir/test/Transforms/Vectorize/vectorize_1d.mlir
+++ b/mlir/test/Transforms/Vectorize/vectorize_1d.mlir
@@ -13,6 +13,7 @@
 // Maps introduced to vectorize fastest varying memory index.
 // CHECK-LABEL: func @vec1d_1
 func @vec1d_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32
 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index
 // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref<?x?xf32>
 // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %{{.*}}, 1 : memref<?x?xf32>
@@ -25,7 +26,7 @@ func @vec1d_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 // CHECK: for {{.*}} step 128
 // CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
 // CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]])
-// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
+// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
   affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector
     %a0 = affine.load %A[%cst0, %cst0] : memref<?x?xf32>
   }
@@ -34,6 +35,7 @@

 // CHECK-LABEL: func @vec1d_2
 func @vec1d_2(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32
 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index
 // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref<?x?xf32>
 // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %{{.*}}, 1 : memref<?x?xf32>
@@ -44,7 +46,7 @@ func @vec1d_2(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
   %cst0 = constant 0 : index
 //
 // CHECK:for [[IV3:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
-// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
+// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
   affine.for %i3 = 0 to %M { // vectorized
     %a3 = affine.load %A[%cst0, %i3] : memref<?x?xf32>
   }
@@ -53,6 +55,7 @@

 // CHECK-LABEL: func @vec1d_3
 func @vec1d_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32
 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index
 // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %arg0, 0 : memref<?x?xf32>
 // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %arg0, 1 : memref<?x?xf32>
@@ -66,7 +69,7 @@ func @vec1d_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 // CHECK-NEXT: for [[IV9:%[arg0-9]*]] = 0 to [[ARG_N]] {
 // CHECK-NEXT: %[[APP9_0:[0-9]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
 // CHECK-NEXT: %[[APP9_1:[0-9]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
-// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]] {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
+// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
   affine.for %i8 = 0 to %M { // vectorized
     affine.for %i9 = 0 to %N {
       %a9 = affine.load %A[%i9, %i8 + %i9] : memref<?x?xf32>
@@ -100,8 +103,8 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
   }
   affine.for %i4 = 0 to %M {
     affine.for %i5 = 0 to %N {
-      // CHECK: [[A5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}] {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
-      // CHECK: [[B5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}] {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
+      // CHECK: [[A5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
+      // CHECK: [[B5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
       // CHECK: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<128xf32>
       // CHECK: [[SPLAT1:%.*]] = constant dense<1.000000e+00> : vector<128xf32>
       // CHECK: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<128xf32>
@@ -165,6 +168,7 @@ func @vec_rejected_2(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {

 // CHECK-LABEL: func @vec_rejected_3
 func @vec_rejected_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32
 // CHECK-DAG: [[C0:%[a-z0-9_]+]] = constant 0 : index
 // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref<?x?xf32>
 // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %{{.*}}, 1 : memref<?x?xf32>
@@ -176,7 +180,7 @@ func @vec_rejected_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 //
 // CHECK:for [[IV4:%[arg0-9]+]] = 0 to [[ARG_M]] step 128 {
 // CHECK-NEXT: for [[IV5:%[arg0-9]*]] = 0 to [[ARG_N]] {
-// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
+// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref<?x?xf32>, vector<128xf32>
   affine.for %i4 = 0 to %M { // vectorized
     affine.for %i5 = 0 to %N { // not vectorized, would vectorize with --test-fastest-varying=1
      %a5 = affine.load %A[%i5, %i4] : memref<?x?xf32>
@@ -273,6 +277,7 @@ func @vec_rejected_7(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {

 // CHECK-LABEL: func @vec_rejected_8
 func @vec_rejected_8(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32
 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index
 // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref<?x?xf32>
 // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %{{.*}}, 1 : memref<?x?xf32>
@@ -286,7 +291,7 @@ func @vec_rejected_8(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 // CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
 // CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
 // CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
-// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
+// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
   affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}}
     affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector
       %a18 = affine.load %A[%cst0, %cst0] : memref<?x?xf32>
@@ -297,6 +302,7 @@ func @vec_rejected_8(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {

 // CHECK-LABEL: func @vec_rejected_9
 func @vec_rejected_9(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32
 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index
 // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref<?x?xf32>
 // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %{{.*}}, 1 : memref<?x?xf32>
@@ -310,7 +316,7 @@ func @vec_rejected_9(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 // CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
 // CHECK: %{{.*}} = affine.apply #map0(%{{.*}})
 // CHECK-NEXT: %{{.*}} = affine.apply #map0(%{{.*}})
-// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
+// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
   affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}}
     affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector
       %a18 = affine.load %A[%cst0, %cst0] : memref<?x?xf32>
diff --git a/mlir/test/Transforms/Vectorize/vectorize_2d.mlir b/mlir/test/Transforms/Vectorize/vectorize_2d.mlir
index 6526d6b757f..a7553092505 100644
--- a/mlir/test/Transforms/Vectorize/vectorize_2d.mlir
+++ b/mlir/test/Transforms/Vectorize/vectorize_2d.mlir
@@ -69,8 +69,8 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
   }
   affine.for %i4 = 0 to %M {
     affine.for %i5 = 0 to %N {
-      // CHECK: [[A5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}] {permutation_map = #[[map_id2]]} : memref<?x?xf32>, vector<32x256xf32>
-      // CHECK: [[B5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}] {permutation_map = #[[map_id2]]} : memref<?x?xf32>, vector<32x256xf32>
+      // CHECK: [[A5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {permutation_map = #[[map_id2]]} : memref<?x?xf32>, vector<32x256xf32>
+      // CHECK: [[B5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {permutation_map = #[[map_id2]]} : memref<?x?xf32>, vector<32x256xf32>
       // CHECK: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<32x256xf32>
       // CHECK: [[SPLAT1:%.*]] = constant dense<1.000000e+00> : vector<32x256xf32>
      // CHECK: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<32x256xf32>
@@ -120,10 +120,10 @@ func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: me
   // VECT: affine.for %[[I2:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[M]]) step 4 {
   // VECT-NEXT: affine.for %[[I3:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[N]]) step 8 {
   // VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[map_id1]](%[[K]]) {
-  // VECT-NEXT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]] {permutation_map = #[[map_proj_d0d1_zerod1]]} : memref<?x?xf32>, vector<4x8xf32>
-  // VECT-NEXT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]] {permutation_map = #[[map_proj_d0d1_d0zero]]} : memref<?x?xf32>, vector<4x8xf32>
+  // VECT-NEXT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[map_proj_d0d1_zerod1]]} : memref<?x?xf32>, vector<4x8xf32>
+  // VECT-NEXT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[map_proj_d0d1_d0zero]]} : memref<?x?xf32>, vector<4x8xf32>
   // VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>
-  // VECT-NEXT: %[[D:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I3]]] {permutation_map = #[[map_id2]]} : memref<?x?xf32>, vector<4x8xf32>
+  // VECT-NEXT: %[[D:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I3]]], %{{.*}} {permutation_map = #[[map_id2]]} : memref<?x?xf32>, vector<4x8xf32>
   // VECT-NEXT: %[[E:.*]] = addf %[[D]], %[[C]] : vector<4x8xf32>
   // VECT-NEXT: vector.transfer_write %[[E]], %{{.*}}[%[[I2]], %[[I3]]] {permutation_map = #[[map_id2]]} : vector<4x8xf32>, memref<?x?xf32>
   affine.for %i2 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%M) {
diff --git a/mlir/test/Transforms/Vectorize/vectorize_3d.mlir b/mlir/test/Transforms/Vectorize/vectorize_3d.mlir
index 797e58e0d40..df60806155a 100644
--- a/mlir/test/Transforms/Vectorize/vectorize_3d.mlir
+++ b/mlir/test/Transforms/Vectorize/vectorize_3d.mlir
@@ -12,7 +12,7 @@ func @vec3d(%A : memref<?x?x?xf32>) {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 64 {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 {
-  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d0d1d2]]} : memref<?x?x?xf32>, vector<32x64x256xf32>
+  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d0d1d2]]} : memref<?x?x?xf32>, vector<32x64x256xf32>
   affine.for %t0 = 0 to %0 {
     affine.for %t1 = 0 to %0 {
      affine.for %i0 = 0 to %0 {
diff --git a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir b/mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir
index ded8dfad815..e398144a222 100644
--- a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir
+++ b/mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir
@@ -10,7 +10,7 @@ func @vec2d(%A : memref<?x?x?xf32>) {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256
-  // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d0d2]]} : memref<?x?x?xf32>, vector<32x256xf32>
+  // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d0d2]]} : memref<?x?x?xf32>, vector<32x256xf32>
   affine.for %i0 = 0 to %M {
     affine.for %i1 = 0 to %N {
       affine.for %i2 = 0 to %P {
diff --git a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir b/mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir
index 36b1a4d4044..d2de5f8d159 100644
--- a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir
+++ b/mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir
@@ -22,7 +22,7 @@ func @vec2d(%A : memref<?x?x?xf32>) {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
-  // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref<?x?x?xf32>, vector<32x256xf32>
+  // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref<?x?x?xf32>, vector<32x256xf32>
   affine.for %i3 = 0 to %M {
     affine.for %i4 = 0 to %N {
       affine.for %i5 = 0 to %P {
@@ -40,12 +40,12 @@ func @vec2d_imperfectly_nested(%A : memref<?x?x?xf32>) {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 {
-  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref<?x?x?xf32>, vector<32x256xf32>
+  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref<?x?x?xf32>, vector<32x256xf32>
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
-  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref<?x?x?xf32>, vector<32x256xf32>
+  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref<?x?x?xf32>, vector<32x256xf32>
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
-  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref<?x?x?xf32>, vector<32x256xf32>
+  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d0]]} : memref<?x?x?xf32>, vector<32x256xf32>
   affine.for %i0 = 0 to %0 {
     affine.for %i1 = 0 to %1 {
       affine.for %i2 = 0 to %2 {
diff --git a/mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir b/mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir
index 4f61a26b4a8..765cd07ce7d 100644
--- a/mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir
+++ b/mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir
@@ -22,7 +22,7 @@ func @vec2d(%A : memref<?x?x?xf32>) {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256
-  // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref<?x?x?xf32>, vector<32x256xf32>
+  // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref<?x?x?xf32>, vector<32x256xf32>
   affine.for %i3 = 0 to %M {
     affine.for %i4 = 0 to %N {
       affine.for %i5 = 0 to %P {
@@ -40,12 +40,12 @@ func @vec2d_imperfectly_nested(%A : memref<?x?x?xf32>) {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
-  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref<?x?x?xf32>, vector<32x256xf32>
+  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref<?x?x?xf32>, vector<32x256xf32>
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 {
-  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref<?x?x?xf32>, vector<32x256xf32>
+  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref<?x?x?xf32>, vector<32x256xf32>
   // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 {
-  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref<?x?x?xf32>, vector<32x256xf32>
+  // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d2d1]]} : memref<?x?x?xf32>, vector<32x256xf32>
   affine.for %i0 = 0 to %0 {
     affine.for %i1 = 0 to %1 {
       affine.for %i2 = 0 to %2 {
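For quick reference, here is the updated surface syntax in one place, as exercised by the tests in this patch. This is a minimal sketch: the function name and SSA value names are illustrative, while the operation forms are taken from `ops.mlir` and `vector-to-llvm.mlir` above. `vector.transfer_read` now takes a mandatory padding operand after the indices, `permutation_map` is a required attribute on both transfer ops, and `vector.type_cast` uses the `to` keyword and yields a 0-d `memref<vector<...>>`:

```mlir
func @vector_transfer_syntax_sketch(%A : memref<?x?xf32>, %M : memref<8x8x8xf32>) {
  %c3 = constant 3 : index
  %f0 = constant 0.0 : f32
  // The padding value %f0 is now a required trailing operand on transfer_read.
  %v = vector.transfer_read %A[%c3, %c3], %f0 {permutation_map = (d0, d1)->(d1, d0)} : memref<?x?xf32>, vector<3x7xf32>
  // transfer_write keeps its operand list; permutation_map remains mandatory.
  vector.transfer_write %v, %A[%c3, %c3] {permutation_map = (d0, d1)->(d1, d0)} : vector<3x7xf32>, memref<?x?xf32>
  // type_cast now uses `to` and produces a 0-d memref of vector.
  %view = vector.type_cast %M : memref<8x8x8xf32> to memref<vector<8x8x8xf32>>
  return
}
```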