//===- MaterializeVectors.cpp - MaterializeVectors Pass Impl -------------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements target-dependent materialization of super-vectors to
// vectors of the proper size for the hardware.
//
//===----------------------------------------------------------------------===//

#include "mlir/AffineOps/AffineOps.h"
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/Dominance.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/NestedMatcher.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/Analysis/VectorAnalysis.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/Types.h"
#include "mlir/Pass/Pass.h"
#include "mlir/StandardOps/Ops.h"
#include "mlir/Support/Functional.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/VectorOps/VectorOps.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

///
/// Implements target-dependent materialization of virtual super-vectors to
/// vectors of the proper size for the hardware.
///
/// While the physical vector size is target-dependent, the pass is written in
/// a target-independent way: the target vector size is specified as a
/// parameter to the pass. This pass is thus a partial lowering that opens the
/// "greybox" that is the super-vector abstraction. In particular, this pass
/// can turn the vector.transfer_read and vector.transfer_write ops into
/// either:
///   1. a loop nest with either scalar or vector load/store operations; or
///   2. a loop nest with DmaStartOp / DmaWaitOp; or
///   3. a pre-existing blackbox library call that can be written manually or
///      synthesized using search and superoptimization.
/// An important feature that each of these 3 target lowering abstractions
/// must provide is the handling of "non-effecting" padding with the proper
/// neutral element, in order to guarantee that all "partial tiles" are
/// actually "full tiles" in practice.
///
/// In particular, this pass is an MLIR-to-MLIR rewriting and does not concern
/// itself with target-specific instruction selection and register allocation.
/// These will happen downstream in LLVM.
///
/// In this sense, despite performing lowering to a target-dependent size,
/// this pass is still target-agnostic.
///
/// Implementation details
/// ======================
/// The current decisions made by the super-vectorization pass guarantee that
/// use-def chains do not escape an enclosing vectorized AffineForOp. In other
/// words, this pass operates on a scoped program slice.
/// Furthermore, since we do not vectorize in the presence of conditionals for
/// now, sliced chains are guaranteed not to escape the innermost scope, which
/// has to be either the top Function scope or the innermost loop scope, by
/// construction. As a consequence, the implementation just starts from
/// vector.transfer_write operations and builds the slice scoped at the
/// innermost loop enclosing the current vector.transfer_write. These
/// assumptions and the implementation details are subject to revision in the
/// future.
///
/// Example
/// ========
/// In the following, the single vector.transfer_write op operates on a
/// vector<4x4x4xf32>. Let's assume the HW supports vector<4x4xf32>.
/// Materialization is achieved by instantiating each occurrence of the
/// leading dimension of vector<4x4x4xf32> into a vector<4x4xf32>.
/// The program transformation that implements this instantiation is a
/// multi-loop unroll-and-jam (it can be partial or full depending on the
/// ratio of super-vector shape to HW-vector shape).
///
/// As a simple case, the following:
///
/// ```mlir
/// func @materialize(%M : index, %N : index, %O : index, %P : index) {
///   %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32>
///   %f1 = constant dense<vector<4x4x4xf32>, 1.000000e+00> :
///          vector<4x4x4xf32>
///   affine.for %i0 = 0 to %M step 4 {
///     affine.for %i1 = 0 to %N step 4 {
///       affine.for %i2 = 0 to %O {
///         affine.for %i3 = 0 to %P step 4 {
///           vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3]
///             {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} :
///             vector<4x4x4xf32>, memref<?x?x?x?xf32>
///   }}}}
///   return
/// }
/// ```
///
/// is instantiated by unroll-and-jam (just unroll in this case) into:
///
/// ```mlir
/// func @materialize(%M : index, %N : index, %O : index, %P : index) {
///   %A = alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32>
///   %f1 = constant dense<vector<4x4x4xf32>, 1.000000e+00> :
///          vector<4x4x4xf32>
///   affine.for %i0 = 0 to %M step 4 {
///     affine.for %i1 = 0 to %N step 4 {
///       affine.for %i2 = 0 to %O {
///         affine.for %i3 = 0 to %P step 4 {
///           vector.transfer_write {{.*}}, %A[%i0, %i1, %i2, %i3]
///             {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} :
///             vector<4x4xf32>, memref<?x?x?x?xf32>
///           %i3p1 = affine.apply (d0) -> (d0 + 1)(%i3)
///           vector.transfer_write {{.*}}, %A[%i0, %i1, %i2, %i3p1]
///             {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} :
///             vector<4x4xf32>, memref<?x?x?x?xf32>
///           %i3p2 = affine.apply (d0) -> (d0 + 2)(%i3)
///           vector.transfer_write {{.*}}, %A[%i0, %i1, %i2, %i3p2]
///             {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} :
///             vector<4x4xf32>, memref<?x?x?x?xf32>
///           %i3p3 = affine.apply (d0) -> (d0 + 3)(%i3)
///           vector.transfer_write {{.*}}, %A[%i0, %i1, %i2, %i3p3]
///             {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} :
///             vector<4x4xf32>, memref<?x?x?x?xf32>
///   }}}}
///   return
/// }
/// ```

using llvm::dbgs;
using llvm::DenseSet;
using llvm::SetVector;

using namespace mlir;

using functional::makePtrDynCaster;
using functional::map;

static llvm::cl::list<int> clVectorSize(
    "vector-size",
    llvm::cl::desc("Specify the HW vector size for vectorization"),
    llvm::cl::ZeroOrMore);

#define DEBUG_TYPE "materialize-vect"
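// Illustrative invocation (a sketch; assumes a standard mlir-opt binary with
// this pass registered). The repeated -vector-size flag populates
// clVectorSize, here materializing vector<4x4x4xf32> super-vectors into
// vector<4x4xf32> hardware vectors:
//
//   mlir-opt -affine-materialize-vectors -vector-size=4 -vector-size=4 in.mlir
//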
namespace {

struct MaterializationState {
  /// In practice, the determination of the HW-specific vector type to use
  /// when lowering a super-vector type must be based on the elemental type.
  /// The elemental type must be retrieved from the super-vector type. In the
  /// future, information about the hardware vector type for a particular
  /// elemental type will be part of the contract between MLIR and the
  /// backend.
  ///
  /// For example, 8xf32 has the same size as 16xf16 but the targeted HW
  /// itself may exhibit the following properties:
  ///   1. have a special unit for a 128xf16 datapath;
  ///   2. have no f16 FPU support on the regular 8xf32/16xf16 vector
  ///      datapath.
  ///
  /// For now, we just assume hwVectorSize has the proper information
  /// regardless of the type and we assert everything is f32.
  /// TODO(ntv): relax the assumptions on admissible element type once a
  /// contract exists.
  MaterializationState(SmallVector<int64_t, 8> sizes) : hwVectorSize(sizes) {}

  SmallVector<int64_t, 8> hwVectorSize;
  VectorType superVectorType;
  VectorType hwVectorType;
  SmallVector<unsigned, 8> hwVectorInstance;
  DenseMap<Value *, Value *> *substitutionsMap;
};

/// Base state for the vector materialization pass.
/// Command line arguments are preempted by non-empty pass arguments.
struct MaterializeVectorsPass : public FunctionPass<MaterializeVectorsPass> {
  MaterializeVectorsPass()
      : hwVectorSize(clVectorSize.begin(), clVectorSize.end()) {}
  MaterializeVectorsPass(ArrayRef<int64_t> hwVectorSize)
      : MaterializeVectorsPass() {
    if (!hwVectorSize.empty())
      this->hwVectorSize.assign(hwVectorSize.begin(), hwVectorSize.end());
  }

  SmallVector<int64_t, 8> hwVectorSize;

  void runOnFunction() override;
};

} // end anonymous namespace

/// Given a shape with sizes greater than 0 along all dimensions, returns the
/// distance, in number of elements, between a slice in a dimension and the
/// next slice in the same dimension.
///   e.g. shape[3, 4, 5] -> strides[20, 5, 1]
static SmallVector<unsigned, 8> makeStrides(ArrayRef<unsigned> shape) {
  SmallVector<unsigned, 8> tmp;
  tmp.reserve(shape.size());
  unsigned running = 1;
  for (auto rit = shape.rbegin(), reit = shape.rend(); rit != reit; ++rit) {
    assert(*rit > 0 && "size must be greater than 0 along all dimensions of "
                       "shape");
    tmp.push_back(running);
    running *= *rit;
  }
  return SmallVector<unsigned, 8>(tmp.rbegin(), tmp.rend());
}

/// Given a shape with sizes greater than 0 along all dimensions, returns the
/// delinearized components of linearIndex along shape.
static SmallVector<unsigned, 8> delinearize(unsigned linearIndex,
                                            ArrayRef<unsigned> shape) {
  SmallVector<unsigned, 8> res;
  res.reserve(shape.size());
  auto strides = makeStrides(shape);
  for (unsigned idx = 0; idx < strides.size(); ++idx) {
    assert(strides[idx] > 0);
    auto val = linearIndex / strides[idx];
    res.push_back(val);
    assert(val < shape[idx] && "delinearization is out of bounds");
    linearIndex %= strides[idx];
  }
  // Sanity check.
  assert(linearIndex == 0 && "linear index constructed from shape must "
                             "have 0 remainder after delinearization");
  return res;
}
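// Worked example (illustrative): for shape = {3, 4, 5}, makeStrides returns
// {20, 5, 1}. delinearize(37, {3, 4, 5}) then walks those strides and yields
// {1, 3, 2}, since 37 == 1 * 20 + 3 * 5 + 2 * 1 with 0 remainder.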
static Operation *instantiate(OpBuilder b, Operation *opInst,
                              VectorType hwVectorType,
                              DenseMap<Value *, Value *> *substitutionsMap);

/// Not all Values belong to a program slice scoped within the immediately
/// enclosing loop.
/// One simple example is constants defined outside the innermost loop scope.
/// For such cases the substitutionsMap has no entry and we allow an
/// additional insertion.
/// For now, this is limited to ConstantOp because we do not vectorize loop
/// indices; it will need to be extended in the future.
///
/// If substitution fails, returns nullptr.
static Value *substitute(Value *v, VectorType hwVectorType,
                         DenseMap<Value *, Value *> *substitutionsMap) {
  auto it = substitutionsMap->find(v);
  if (it == substitutionsMap->end()) {
    auto *opInst = v->getDefiningOp();
    if (isa<ConstantOp>(opInst)) {
      OpBuilder b(opInst);
      auto *op = instantiate(b, opInst, hwVectorType, substitutionsMap);
      auto res =
          substitutionsMap->insert(std::make_pair(v, op->getResult(0)));
      assert(res.second && "Insertion failed");
      return res.first->second;
    }
    v->getDefiningOp()->emitError("Missing substitution");
    return nullptr;
  }
  return it->second;
}

/// Returns a list of single result AffineApplyOps that reindex the
/// `memRefIndices` by the multi-dimensional `hwVectorInstance`. This is used
/// by the function that materializes a vector.transfer operation to use
/// hardware vector types instead of super-vector types.
///
/// The general problem this function solves is as follows:
/// Assume a vector.transfer operation at the super-vector granularity that
/// has `l` enclosing loops (AffineForOp). Assume the vector transfer
/// operation operates on a MemRef of rank `r`, a super-vector of rank `s`
/// and a hardware vector of rank `h`. For the purpose of illustration assume
/// l==4, r==3, s==2, h==1 and that the super-vector is vector<3x32xf32> and
/// the hardware vector is vector<8xf32>. Assume the following MLIR snippet
/// after super-vectorization has been applied:
///
/// ```mlir
/// affine.for %i0 = 0 to %M {
///   affine.for %i1 = 0 to %N step 3 {
///     affine.for %i2 = 0 to %O {
///       affine.for %i3 = 0 to %P step 32 {
///         %r = vector.transfer_read(%A, map0(%i..), map1(%i..), map2(%i..))
///              : vector<3x32xf32>, memref<?x?x?xf32>
///         ...
/// }}}}
/// ```
///
/// where `map` denotes an AffineMap operating on enclosing loops with
/// properties compatible for vectorization (i.e. some contiguity left
/// unspecified here). Note that the vectorized loops are %i1 and %i3.
/// This function translates the vector.transfer_read operation to multiple
/// instances of vector.transfer_read that operate on vector<8xf32>.
///
/// Without loss of generality, we assume hwVectorInstance is: {2, 1}.
/// The only constraint on hwVectorInstance is that it belongs to:
///   [0, 2] x [0, 3], which is the span of the ratio of super-vector shape
/// to hardware vector shape in our example.
///
/// This function instantiates the iteration <2, 1> of vector.transfer_read
/// into the set of operations in pseudo-MLIR:
///
/// ```mlir
/// #map2 = (d0, d1, d2, d3) -> (d0, d1 + 2, d2, d3 + 1 * 8)
/// #map3 = #map o #map2 // where o denotes composition
/// %aff0 = affine.apply #map3.0(%i..)
/// %aff1 = affine.apply #map3.1(%i..)
/// %aff2 = affine.apply #map3.2(%i..)
/// %r = vector.transfer_read(%A, %aff0, %aff1, %aff2)
///      : vector<8xf32>, memref<?x?x?xf32>
/// ```
///
/// Practical considerations
/// ========================
/// For now, `map` is assumed to be the identity map and the indices are
/// specified just as vector.transfer_read %A[%i0, %i1, %i2, %i3]. This will
/// be extended in the future once we have a proper Op for vector transfers.
/// Additionally, the example above is specified in pseudo-MLIR form; once we
/// have proper support for generic maps we can generate the code and show
/// actual MLIR.
///
/// TODO(ntv): support a concrete AffineMap and compose with it.
/// TODO(ntv): these implementation details should be captured in a
/// vectorization trait at the op level directly.
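/// As a concrete trace of the arithmetic implemented below (illustrative):
/// with 4 memref indices, hwVectorInstance = {2, 1} and a 1-D hardware
/// vector shape of {8}, we get numMemRefIndices == 2 and
/// numVectorIndices == 1; the affine expressions produced are then
///   (d0, d1, d2 + 2, d3 + 1 * 8).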
static SmallVector<Value *, 8>
reindexAffineIndices(OpBuilder b, VectorType hwVectorType,
                     ArrayRef<unsigned> hwVectorInstance,
                     ArrayRef<Value *> memrefIndices) {
  auto vectorShape = hwVectorType.getShape();
  assert(hwVectorInstance.size() >= vectorShape.size());

  unsigned numIndices = memrefIndices.size();
  auto numMemRefIndices = numIndices - hwVectorInstance.size();
  auto numVectorIndices = hwVectorInstance.size() - vectorShape.size();

  SmallVector<AffineExpr, 8> affineExprs;
  // TODO(ntv): support a concrete map and composition.
  unsigned i = 0;
  // The first numMemRefIndices correspond to AffineForOp that have not been
  // vectorized, the transformation is the identity on those.
  for (i = 0; i < numMemRefIndices; ++i) {
    auto d_i = b.getAffineDimExpr(i);
    affineExprs.push_back(d_i);
  }
  // The next numVectorIndices correspond to super-vector dimensions that
  // do not have a hardware vector dimension counterpart. For those we only
  // need to increment the index by the corresponding hwVectorInstance.
  for (i = numMemRefIndices; i < numMemRefIndices + numVectorIndices; ++i) {
    auto d_i = b.getAffineDimExpr(i);
    auto offset = hwVectorInstance[i - numMemRefIndices];
    affineExprs.push_back(d_i + offset);
  }
  // The remaining indices correspond to super-vector dimensions that have a
  // hardware vector dimension counterpart. For those we need to increment
  // the index by "hwVectorInstance" multiples of the corresponding hardware
  // vector size.
  for (; i < numIndices; ++i) {
    auto d_i = b.getAffineDimExpr(i);
    auto offset = hwVectorInstance[i - numMemRefIndices];
    auto stride = vectorShape[i - numMemRefIndices - numVectorIndices];
    affineExprs.push_back(d_i + offset * stride);
  }

  // Create a bunch of single result AffineApplyOp.
  SmallVector<Value *, 8> res;
  res.reserve(affineExprs.size());
  for (auto expr : affineExprs) {
    auto map = AffineMap::get(numIndices, 0, expr);
    res.push_back(makeComposedAffineApply(b, b.getInsertionPoint()->getLoc(),
                                          map, memrefIndices));
  }
  return res;
}

/// Returns attributes with the following substitutions applied:
///   - constant splat is replaced by constant splat of `hwVectorType`.
/// TODO(ntv): add more substitutions on a per-need basis.
static SmallVector<NamedAttribute, 1>
materializeAttributes(Operation *opInst, VectorType hwVectorType) {
  SmallVector<NamedAttribute, 1> res;
  for (auto a : opInst->getAttrs()) {
    if (auto splat = a.second.dyn_cast<SplatElementsAttr>()) {
      auto attr = SplatElementsAttr::get(hwVectorType, splat.getSplatValue());
      res.push_back(NamedAttribute(a.first, attr));
    } else {
      res.push_back(a);
    }
  }
  return res;
}
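// Example (illustrative): with hwVectorType vector<4x4xf32>, an attribute
// {value: dense<vector<4x4x4xf32>, 1.000000e+00>} on a super-vector constant
// is rewritten to {value: dense<vector<4x4xf32>, 1.000000e+00>}; attributes
// that are not splats pass through unchanged.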
/// Creates an instantiated version of `opInst`.
/// Ops other than VectorTransferReadOp/VectorTransferWriteOp require no
/// affine reindexing. Just substitute their Value operands and be done. For
/// this case the actual instance is irrelevant. Just use the values in
/// substitutionsMap.
///
/// If the underlying substitution fails, this fails too and returns nullptr.
static Operation *instantiate(OpBuilder b, Operation *opInst,
                              VectorType hwVectorType,
                              DenseMap<Value *, Value *> *substitutionsMap) {
  assert(!isa<VectorTransferReadOp>(opInst) &&
         "Should call the function specialized for VectorTransferReadOp");
  assert(!isa<VectorTransferWriteOp>(opInst) &&
         "Should call the function specialized for VectorTransferWriteOp");
  if (opInst->getNumRegions() != 0)
    return nullptr;

  bool fail = false;
  auto operands = map(
      [hwVectorType, substitutionsMap, &fail](Value *v) -> Value * {
        auto *res =
            fail ? nullptr : substitute(v, hwVectorType, substitutionsMap);
        fail |= !res;
        return res;
      },
      opInst->getOperands());
  if (fail)
    return nullptr;

  auto attrs = materializeAttributes(opInst, hwVectorType);

  OperationState state(opInst->getLoc(), opInst->getName().getStringRef(),
                       operands, {hwVectorType}, attrs);
  return b.createOperation(state);
}

/// Computes the permutationMap required for a VectorTransferOp from the
/// memref to the `hwVectorType`.
/// This is achieved by returning the projection of the permutationMap along
/// the dimensions of the super-vector type that remain in the hwVectorType.
/// In particular, if a dimension is fully instantiated (i.e. unrolled) then
/// it is projected out in the final result.
template <typename VectorTransferOpTy>
static AffineMap projectedPermutationMap(VectorTransferOpTy transfer,
                                         VectorType hwVectorType) {
  static_assert(
      std::is_same<VectorTransferOpTy, VectorTransferReadOp>::value ||
          std::is_same<VectorTransferOpTy, VectorTransferWriteOp>::value,
      "Must be called on a VectorTransferOp");
  auto superVectorType = transfer.getVectorType();
  auto optionalRatio = shapeRatio(superVectorType, hwVectorType);
  assert(optionalRatio &&
         (optionalRatio->size() == superVectorType.getShape().size()) &&
         "Shape and ratio not of the same size");
  unsigned dim = 0;
  SmallVector<AffineExpr, 4> keep;
  MLIRContext *context = transfer.getContext();
  functional::zipApply(
      [&dim, &keep, context](int64_t shape, int64_t ratio) {
        assert(shape >= ratio && "shape dim must be greater than ratio dim");
        if (shape != ratio) {
          // HW vector is not fully instantiated along this dim, keep it.
          keep.push_back(getAffineDimExpr(dim, context));
        }
        ++dim;
      },
      superVectorType.getShape(), *optionalRatio);
  auto permutationMap = transfer.getPermutationMap();
  LLVM_DEBUG(permutationMap.print(dbgs() << "\npermutationMap: "));
  if (keep.empty()) {
    return permutationMap;
  }
  auto projectionMap = AffineMap::get(optionalRatio->size(), 0, keep);
  LLVM_DEBUG(projectionMap.print(dbgs() << "\nprojectionMap: "));
  return simplifyAffineMap(projectionMap.compose(permutationMap));
}
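// Worked example (illustrative): for superVectorType vector<3x32xf32>,
// hwVectorType vector<8xf32> and permutationMap (d0, d1, d2, d3) ->
// (d1, d3), the shape ratio is {3, 4}. Dimension 0 (shape 3, ratio 3) is
// fully instantiated and projected out; dimension 1 (shape 32, ratio 4) is
// kept. The projected result is therefore (d0, d1, d2, d3) -> (d3).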
/// Creates an instantiated version of `read` for the instance of
/// `hwVectorInstance` when lowering from a super-vector type to
/// `hwVectorType`. `hwVectorInstance` represents one particular instance of
/// `hwVectorType` in the covering of the super-vector type. For a more
/// detailed description of the problem, see the description of
/// reindexAffineIndices.
static Operation *instantiate(OpBuilder b, VectorTransferReadOp read,
                              VectorType hwVectorType,
                              ArrayRef<unsigned> hwVectorInstance,
                              DenseMap<Value *, Value *> *substitutionsMap) {
  SmallVector<Value *, 8> indices =
      map(makePtrDynCaster<Value>(), read.getIndices());
  auto affineIndices =
      reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices);
  auto map = projectedPermutationMap(read, hwVectorType);
  if (!map) {
    return nullptr;
  }
  auto cloned = b.create<VectorTransferReadOp>(
      read.getLoc(), hwVectorType, read.getMemRef(), affineIndices, map,
      read.getPaddingValue());
  return cloned.getOperation();
}

/// Creates an instantiated version of `write` for the instance of
/// `hwVectorInstance` when lowering from a super-vector type to
/// `hwVectorType`. `hwVectorInstance` represents one particular instance of
/// `hwVectorType` in the covering of the super-vector type. For a more
/// detailed description of the problem, see the description of
/// reindexAffineIndices.
static Operation *instantiate(OpBuilder b, VectorTransferWriteOp write,
                              VectorType hwVectorType,
                              ArrayRef<unsigned> hwVectorInstance,
                              DenseMap<Value *, Value *> *substitutionsMap) {
  SmallVector<Value *, 8> indices =
      map(makePtrDynCaster<Value>(), write.getIndices());
  auto affineIndices =
      reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices);
  auto cloned = b.create<VectorTransferWriteOp>(
      write.getLoc(),
      substitute(write.getVector(), hwVectorType, substitutionsMap),
      write.getMemRef(), affineIndices,
      projectedPermutationMap(write, hwVectorType));
  return cloned.getOperation();
}

/// Clones and inserts an instance of `op` materialized on `hwVectorType`.
/// The multi-dimensional `hwVectorInstance` belongs to the shapeRatio of
/// super-vector type to hw vector type.
/// A cloned instance of `op` is formed as follows:
///   1. vector.transfer_read: the return `superVectorType` is replaced by
///      `hwVectorType`. Additionally, affine indices are reindexed with
///      `reindexAffineIndices` using `hwVectorInstance` and vector type
///      information;
///   2. vector.transfer_write: the `valueToStore` type is simply substituted.
///      Since we operate on a topologically sorted slice, a substitution must
///      have been registered for non-constant ops. Additionally, affine
///      indices are reindexed in the same way as for vector.transfer_read;
///   3. constant ops are splats of the super-vector type by construction.
///      They are cloned to a splat on the hw vector type with the same value;
///   4. remaining ops are cloned to a version of the op that returns a hw
///      vector type, all operands are substituted according to
///      `substitutionsMap`. Thanks to the topological order of a slice, the
///      substitution is always possible.
///
/// Returns true on failure.
static bool instantiateMaterialization(Operation *op,
                                       MaterializationState *state) {
  LLVM_DEBUG(dbgs() << "\ninstantiate: " << *op);

  // Create a builder here for unroll-and-jam effects.
  OpBuilder b(op);
  // AffineApplyOp are ignored: instantiating the proper vector op will take
  // care of AffineApplyOps by composing them properly.
  if (isa<AffineApplyOp>(op)) {
    return false;
  }
  if (op->getNumRegions() != 0)
    return op->emitError("NYI path Op with region"), true;

  if (auto write = dyn_cast<VectorTransferWriteOp>(op)) {
    auto *clone =
        instantiate(b, write, state->hwVectorType, state->hwVectorInstance,
                    state->substitutionsMap);
    return clone == nullptr;
  }
  if (auto read = dyn_cast<VectorTransferReadOp>(op)) {
    auto *clone =
        instantiate(b, read, state->hwVectorType, state->hwVectorInstance,
                    state->substitutionsMap);
    if (!clone) {
      return true;
    }
    state->substitutionsMap->insert(
        std::make_pair(read.getResult(), clone->getResult(0)));
    return false;
  }
  // The only ops with 0 results reaching this point must, by construction, be
  // VectorTransferWriteOps and have been caught above. Ops with >= 2 results
  // are not yet supported. So just support 1 result.
  if (op->getNumResults() != 1) {
    return op->emitError("NYI: ops with != 1 results"), true;
  }
  if (op->getResult(0)->getType() != state->superVectorType) {
    return op->emitError("Op does not return a supervector."), true;
  }
  auto *clone =
      instantiate(b, op, state->hwVectorType, state->substitutionsMap);
  if (!clone) {
    return true;
  }
  state->substitutionsMap->insert(
      std::make_pair(op->getResult(0), clone->getResult(0)));
  return false;
}
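// Example trace (illustrative): for a slice {%f1 = constant splat,
// vector.transfer_write %f1}, the constant is visited first (topological
// order), reaches the generic path above, is cloned to a vector<4x4xf32>
// splat and registered in substitutionsMap; the transfer_write is then
// cloned with the materialized splat substituted for %f1 and with its
// indices reindexed.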
/// Takes a slice and rewrites the operations in it so that occurrences
/// of `superVectorType` are replaced by `hwVectorType`.
///
/// Implementation
/// ==============
///   1. computes the shape ratio of super-vector to HW vector shapes. This
///      gives, for each op in the slice, how many instantiations are
///      required in each dimension;
///   2. performs the concrete materialization. Note that in a first
///      implementation we use full unrolling because it pragmatically
///      removes the need to explicitly materialize an AllocOp. Thanks to the
///      properties of super-vectors, this unrolling is always possible and
///      simple: vectorizing to a super-vector abstraction already achieved
///      the equivalent of loop strip-mining + loop sinking and encoded this
///      in the vector type.
///
/// Returns true on failure.
///
/// TODO(ntv): materialized allocs.
/// TODO(ntv): full loops + materialized allocs.
/// TODO(ntv): partial unrolling + materialized allocs.
static bool emitSlice(MaterializationState *state,
                      SetVector<Operation *> *slice) {
  auto ratio = shapeRatio(state->superVectorType, state->hwVectorType);
  assert(ratio.hasValue() &&
         "ratio of super-vector to HW-vector shape is not integral");
  // The number of integer points in a hyperrectangular region is:
  //   shape[0] * strides[0].
  auto numValueToUnroll = (*ratio)[0] * makeStrides(*ratio)[0];
  // Full unrolling to hardware vectors in a first approximation.
  for (unsigned idx = 0; idx < numValueToUnroll; ++idx) {
    // Fresh RAII instanceIndices and substitutionsMap.
    MaterializationState scopedState = *state;
    scopedState.hwVectorInstance = delinearize(idx, *ratio);
    DenseMap<Value *, Value *> substitutionMap;
    scopedState.substitutionsMap = &substitutionMap;
    // The slice is topologically sorted, so we can just clone ops in order.
    for (auto *op : *slice) {
      auto fail = instantiateMaterialization(op, &scopedState);
      if (fail) {
        op->emitError("Unhandled super-vector materialization failure");
        return true;
      }
    }
  }

  LLVM_DEBUG(dbgs() << "\nFunction is now\n");
  LLVM_DEBUG((*slice)[0]->getFunction()->print(dbgs()));

  // The slice is topologically sorted, so we can just erase ops in reverse
  // order. Reverse iterators do not compose simply with erasure via an
  // operator* dereference, so index backwards explicitly.
  for (int idx = slice->size() - 1; idx >= 0; --idx) {
    LLVM_DEBUG(dbgs() << "\nErase: ");
    LLVM_DEBUG((*slice)[idx]->print(dbgs()));
    (*slice)[idx]->erase();
  }
  return false;
}
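// Worked example (illustrative): with superVectorType vector<4x4x4xf32> and
// hwVectorType vector<4x4xf32>, the ratio is {4, 1, 1}, so
// numValueToUnroll == 4 * 1 == 4; the slice is cloned for the instances
// {0, 0, 0} through {3, 0, 0}, which produces the 4 unrolled
// vector.transfer_write ops in the example at the top of this file.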
/// Materializes super-vector types into concrete hw vector types as follows:
///   1. start from super-vector terminators (current vector.transfer_write
///      ops);
///   2. collect all the operations that can be reached by transitive use-def
///      chains;
///   3. get the superVectorType for this particular terminator and the
///      corresponding hardware vector type (for now limited to F32)
///      TODO(ntv): be more general than F32.
///   4. emit the transitive useDef set to operate on the finer-grain vector
///      types.
///
/// Notes
/// =====
/// The `slice` is sorted in topological order by construction.
/// Additionally, this set is limited to operations in the same lexical scope
/// because we currently disallow vectorization of defs that come from
/// another scope.
///
/// Returns true on failure.
static bool materialize(Function *f,
                        const SetVector<Operation *> &terminators,
                        MaterializationState *state) {
  DenseSet<Operation *> seen;
  DominanceInfo domInfo(f);
  for (auto *term : terminators) {
    // Short-circuit test, a given terminator may have been reached by some
    // other previous transitive use-def chains.
    if (seen.count(term) > 0) {
      continue;
    }

    auto terminator = cast<VectorTransferWriteOp>(term);
    LLVM_DEBUG(dbgs() << "\nFrom terminator:" << *term);

    // Get the transitive use-defs starting from terminator, limited to the
    // current enclosing scope of the terminator. See the Notes at the top of
    // this function for the justification of this restriction.
    // TODO(ntv): relax scoping constraints.
    auto *enclosingScope = term->getParentOp();
    auto keepIfInSameScope = [enclosingScope, &domInfo](Operation *op) {
      assert(op && "NULL op");
      if (!enclosingScope) {
        // By construction, everyone is always under the top scope (null
        // scope).
        return true;
      }
      return domInfo.properlyDominates(enclosingScope, op);
    };
    SetVector<Operation *> slice =
        getSlice(term, keepIfInSameScope, keepIfInSameScope);
    assert(!slice.empty());

    // Sanity checks: transitive slice must be completely disjoint from
    // what we have seen so far.
    LLVM_DEBUG(dbgs() << "\nTransitive use-defs:");
    for (auto *ud : slice) {
      LLVM_DEBUG(dbgs() << "\nud:" << *ud);
      assert(seen.count(ud) == 0 &&
             "Transitive use-defs not disjoint from already seen");
      seen.insert(ud);
    }

    // Emit the current slice.
    // Set scoped super-vector and corresponding hw vector types.
    state->superVectorType = terminator.getVectorType();
    assert((state->superVectorType.getElementType() ==
            FloatType::getF32(term->getContext())) &&
           "Only f32 supported for now");
    state->hwVectorType = VectorType::get(
        state->hwVectorSize, state->superVectorType.getElementType());
    auto fail = emitSlice(state, &slice);
    if (fail) {
      return true;
    }
    LLVM_DEBUG(dbgs() << "\nFunction is now\n");
    LLVM_DEBUG(f->print(dbgs()));
  }
  return false;
}

void MaterializeVectorsPass::runOnFunction() {
  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
  NestedPatternContext mlContext;

  // TODO(ntv): Check to see if this supports arbitrary top-level code.
  Function *f = &getFunction();
  if (f->getBlocks().size() != 1)
    return;

  using matcher::Op;
  LLVM_DEBUG(dbgs() << "\nMaterializeVectors on Function\n");
  LLVM_DEBUG(f->print(dbgs()));

  MaterializationState state(hwVectorSize);
  // Get the hardware vector type.
  // TODO(ntv): get elemental type from super-vector type rather than force
  // f32.
  auto subVectorType =
      VectorType::get(hwVectorSize, FloatType::getF32(&getContext()));

  // Capture terminators, i.e. vector.transfer_write ops involving a strict
  // super-vector of subVectorType.
  auto filter = [subVectorType](Operation &op) {
    if (!isa<VectorTransferWriteOp>(op)) {
      return false;
    }
    return matcher::operatesOnSuperVectorsOf(op, subVectorType);
  };
  auto pat = Op(filter);
  SmallVector<NestedMatch, 8> matches;
  pat.match(f, &matches);
  SetVector<Operation *> terminators;
  for (auto m : matches) {
    terminators.insert(m.getMatchedOperation());
  }

  if (materialize(f, terminators, &state))
    signalPassFailure();
}

FunctionPassBase *
mlir::createMaterializeVectorsPass(llvm::ArrayRef<int64_t> vectorSize) {
  return new MaterializeVectorsPass(vectorSize);
}

static PassRegistration<MaterializeVectorsPass>
    pass("affine-materialize-vectors",
         "Materializes super-vectors to vectors of the "
         "proper size for the hardware");

#undef DEBUG_TYPE
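// Programmatic usage sketch (illustrative only; assumes the pass-manager API
// of this vintage, and names may differ):
//
//   PassManager pm;
//   pm.addPass(createMaterializeVectorsPass(/*vectorSize=*/{4, 4}));
//   if (failed(pm.run(module)))
//     /* report the materialization failure */;
//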