//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses, TLB misses, reduced
// use of hardware prefetch streams, and reduced false sharing. It is also
// necessary for hardware that explicitly manages levels in the memory
// hierarchy, and where DMAs may have to be used. This optimization is often
// performed on already tiled code.
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

#include <algorithm>

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");

static llvm::cl::opt<unsigned long long> clFastMemoryCapacity(
    "affine-data-copy-generate-fast-mem-capacity",
    llvm::cl::desc(
        "Set fast memory space capacity in KiB (default: unlimited)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool>
    clDma("affine-data-copy-generate-dma",
          llvm::cl::desc("Generate DMA instead of point-wise copy"),
          llvm::cl::cat(clOptionsCategory), llvm::cl::init(true));

static llvm::cl::opt<unsigned> clFastMemorySpace(
    "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1),
    llvm::cl::desc(
        "Fast memory space identifier for copy generation (default: 1)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
    "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden,
    llvm::cl::init(false),
    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
                   "for copy placement"),
    llvm::cl::cat(clOptionsCategory));

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO(bondhugula): We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public FunctionPass<AffineDataCopyGeneration> {
  explicit AffineDataCopyGeneration(
      unsigned slowMemorySpace = 0,
      unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0,
      int minDmaTransferSize = 1024,
      uint64_t fastMemCapacityBytes =
          (clFastMemoryCapacity.getNumOccurrences() > 0
               ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB
               : std::numeric_limits<uint64_t>::max()),
      bool generateDma = clDma,
      bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop)
      : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace),
        tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize),
        fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma),
        skipNonUnitStrideLoops(skipNonUnitStrideLoops) {}

  explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other)
      : slowMemorySpace(other.slowMemorySpace),
        fastMemorySpace(other.fastMemorySpace),
        tagMemorySpace(other.tagMemorySpace),
        minDmaTransferSize(other.minDmaTransferSize),
        fastMemCapacityBytes(other.fastMemCapacityBytes),
        generateDma(other.generateDma),
        skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}

  void runOnFunction() override;
  LogicalResult runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Slow memory space associated with copies.
  const unsigned slowMemorySpace;
  // Fast memory space associated with copies.
  unsigned fastMemorySpace;
  // Memory space associated with DMA tags.
  unsigned tagMemorySpace;
  // Minimum DMA transfer size supported by the target in bytes.
  const int minDmaTransferSize;
  // Capacity of the faster memory space.
  uint64_t fastMemCapacityBytes;

  // If set, generate DMA operations instead of read/write.
  bool generateDma;

  // If set, ignore loops with steps other than 1.
  bool skipNonUnitStrideLoops;

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // end anonymous namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly
/// created buffers in 'fastMemorySpace', and replaces memory operations to the
/// former by the latter. Only load op's are handled for now.
/// TODO(bondhugula): extend this to store op's.
std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for op's nested within).
LogicalResult
AffineDataCopyGeneration::runOnBlock(Block *block,
                                     DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return success();

  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
  // TODO(bondhugula): An 'affine.if' operation is being treated similarly to
  // an operation.
  // 'affine.if''s could have 'affine.for's in them; treat them separately.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
                isa<AffineForOp>(op)) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy for loop, we will split there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                             copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        Optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp, /*memorySpace=*/0);
        return (footprint.hasValue() &&
                static_cast<uint64_t>(footprint.getValue()) >
                    fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which the footprint fits in fast mem
      // capacity. If the footprint can't be calculated, we assume for now it
      // fits. Recurse inside if the footprint for 'forOp' exceeds capacity, or
      // when skipNonUnitStrideLoops is set and the step size is not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'.
        // Note that for the latter, the copies are placed just before this
        // loop (for incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - we thus don't update the
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
                               copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
                isa<AffineForOp>(op)) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->isKnownTerminator() && "can't be a terminator");
    // Exclude the affine terminator - hence, the std::prev.
    affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
                           copyOptions, copyNests);
  }

  return success();
}

void AffineDataCopyGeneration::runOnFunction() {
  FuncOp f = getFunction();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests.
  for (auto nest : copyNests) {
    nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
  }
}

static PassRegistration<AffineDataCopyGeneration>
    pass("affine-data-copy-generate",
         "Generate explicit copying for memory operations");
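
// Example usage (illustrative only; the input file name and the capacity
// value below are placeholders): the registration above exposes the pass to
// mlir-opt under -affine-data-copy-generate, and the command-line options
// declared at the top of this file tune it, e.g.:
//
//   mlir-opt -affine-data-copy-generate \
//       -affine-data-copy-generate-fast-mem-space=1 \
//       -affine-data-copy-generate-fast-mem-capacity=64 \
//       -affine-data-copy-generate-dma=false input.mlir
//
// Alternatively, createAffineDataCopyGenerationPass() above constructs the
// pass programmatically with explicit memory-space and capacity arguments.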