Diffstat (limited to 'mlir/lib/Dialect/GPU/Transforms')
-rw-r--r-- | mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp | 173
1 files changed, 173 insertions, 0 deletions
diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
new file mode 100644
index 00000000000..f01a430a216
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
@@ -0,0 +1,173 @@
+//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities that allow one to create IR moving the data
+// across different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/EDSC/Builders.h"
+#include "mlir/EDSC/Helpers.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/Functional.h"
+#include "mlir/Transforms/LoopUtils.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+/// Returns the textual name of a GPU dimension.
+static StringRef getDimName(unsigned dim) {
+  if (dim == 0)
+    return "x";
+  if (dim == 1)
+    return "y";
+  if (dim == 2)
+    return "z";
+
+  llvm_unreachable("dimension ID overflow");
+}
+
+/// Emits the (imperfect) loop nest performing the copy between "from" and
+/// "to" values using the bounds derived from the "from" value. Emits at least
+/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
+/// single-iteration loops. Maps the innermost loops to thread dimensions, in
+/// reverse order to enable access coalescing in the innermost loop.
+static void insertCopyLoops(OpBuilder &builder, Location loc,
+                            edsc::MemRefView &bounds, Value from, Value to) {
+  // Create EDSC handles for bounds.
+  unsigned rank = bounds.rank();
+  SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps;
+
+  // Make sure we have enough loops to use all thread dimensions; these
+  // trivial loops should be outermost and are therefore inserted first.
+  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
+    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
+    edsc::ValueHandle zero = edsc::intrinsics::constant_index(0);
+    edsc::ValueHandle one = edsc::intrinsics::constant_index(1);
+    lbs.resize(extraLoops, zero);
+    ubs.resize(extraLoops, one);
+    steps.resize(extraLoops, one);
+  }
+
+  // Add existing bounds.
+  lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
+  ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());
+
+  // Emit constant operations for steps.
+  steps.reserve(lbs.size());
+  llvm::transform(
+      bounds.getSteps(), std::back_inserter(steps),
+      [](int64_t step) { return edsc::intrinsics::constant_index(step); });
+
+  // Obtain thread identifiers and block sizes, necessary to map to them.
+  auto indexType = builder.getIndexType();
+  SmallVector<Value, 3> threadIds, blockDims;
+  for (unsigned i = 0; i < 3; ++i) {
+    auto dimName = builder.getStringAttr(getDimName(i));
+    threadIds.push_back(
+        builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
+    blockDims.push_back(
+        builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
+  }
+
+  // Produce the loop nest with copies.
+  auto ivs = edsc::makeIndexHandles(lbs.size());
+  auto ivPtrs =
+      edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs));
+  edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
+    auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
+    edsc::StdIndexedValue fromHandle(from), toHandle(to);
+    toHandle(activeIvs) = fromHandle(activeIvs);
+  });
+
+  // Map the innermost loops to threads in reverse order.
+  for (auto en :
+       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
+           GPUDialect::getNumWorkgroupDimensions())))) {
+    auto loop = cast<loop::ForOp>(
+        en.value().getValue().getParentRegion()->getParentOp());
+    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
+                          {blockDims[en.index()]});
+  }
+}
+
+/// Emits the loop nests performing the copy to the designated location in the
+/// beginning of the region, and from the designated location immediately
+/// before the terminator of the first block of the region. The region is
+/// expected to have one block. This boils down to the following structure:
+///
+///   ^bb(...):
+///     <loop-bound-computation>
+///     for %arg0 = ... to ... step ... {
+///       ...
+///       for %argN = <thread-id-x> to ... step <block-dim-x> {
+///         %0 = load %from[%arg0, ..., %argN]
+///         store %0, %to[%arg0, ..., %argN]
+///       }
+///       ...
+///     }
+///     gpu.barrier
+///     <... original body ...>
+///     gpu.barrier
+///     for %arg0 = ... to ... step ... {
+///       ...
+///       for %argN = <thread-id-x> to ... step <block-dim-x> {
+///         %1 = load %to[%arg0, ..., %argN]
+///         store %1, %from[%arg0, ..., %argN]
+///       }
+///       ...
+///     }
+///
+/// Inserts the barriers unconditionally since different threads may be
+/// copying values and reading them. An analysis would be required to
+/// eliminate barriers in the case where a value is only used by the thread
+/// that copies it. Both copies are inserted unconditionally; an analysis
+/// would be required to copy only live-in and live-out values when necessary.
+/// This copies the entire memref pointed to by "from". In case a smaller
+/// block would be sufficient, the caller can create a subview of the memref
+/// and promote it instead.
+static void insertCopies(Region &region, Location loc, Value from, Value to) {
+  auto fromType = from.getType().cast<MemRefType>();
+  auto toType = to.getType().cast<MemRefType>();
+  (void)fromType;
+  (void)toType;
+  assert(fromType.getShape() == toType.getShape());
+  assert(fromType.getRank() != 0);
+  assert(has_single_element(region) &&
+         "unstructured control flow not supported");
+
+  OpBuilder builder(region.getContext());
+  builder.setInsertionPointToStart(&region.front());
+
+  edsc::ScopedContext edscContext(builder, loc);
+  edsc::MemRefView fromView(from);
+  insertCopyLoops(builder, loc, fromView, from, to);
+  builder.create<gpu::BarrierOp>(loc);
+
+  builder.setInsertionPoint(&region.front().back());
+  builder.create<gpu::BarrierOp>(loc);
+  insertCopyLoops(builder, loc, fromView, to, from);
+}
+
+/// Promotes a function argument to workgroup memory in the given function.
+/// The copies will be inserted in the beginning and in the end of the
+/// function.
+void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
+  Value value = op.getArgument(arg);
+  auto type = value.getType().dyn_cast<MemRefType>();
+  assert(type && type.hasStaticShape() && "can only promote memrefs");
+
+  Value attribution =
+      op.addWorkgroupAttribution(type.getShape(), type.getElementType());
+
+  // Replace the uses first since only the original uses are currently
+  // present. Then insert the copies.
+  value.replaceAllUsesWith(attribution);
+  insertCopies(op.getBody(), op.getLoc(), value, attribution);
+}
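
For context, a minimal sketch of how the new utility might be driven from a pass. The pass class, its name, and the "promote every statically shaped memref argument" policy are hypothetical illustrations, not part of this change; only mlir::promoteToWorkgroupMemory and gpu::GPUFuncOp come from the code above, and the pass API shown assumes the MLIR revision this patch targets.

    // Hypothetical driver pass: promote each statically shaped memref argument
    // of a gpu.func to workgroup memory using the utility added in this patch.
    #include "mlir/Dialect/GPU/GPUDialect.h"
    #include "mlir/Dialect/GPU/MemoryPromotion.h"
    #include "mlir/Pass/Pass.h"

    namespace {
    struct TestGpuPromotionPass
        : public mlir::OperationPass<TestGpuPromotionPass, mlir::gpu::GPUFuncOp> {
      void runOnOperation() override {
        mlir::gpu::GPUFuncOp func = getOperation();
        // promoteToWorkgroupMemory asserts that the argument is a memref with a
        // static shape, so only such arguments are promoted here.
        for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
          auto type = func.getArgument(i).getType().dyn_cast<mlir::MemRefType>();
          if (type && type.hasStaticShape())
            mlir::promoteToWorkgroupMemory(func, i);
        }
      }
    };
    } // namespace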