//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/MemoryPromotion.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/EDSC/Builders.h"
#include "mlir/EDSC/Helpers.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/Functional.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;
using namespace mlir::gpu;

/// Returns the textual name of a GPU dimension.
static StringRef getDimName(unsigned dim) {
  if (dim == 0)
    return "x";
  if (dim == 1)
    return "y";
  if (dim == 2)
    return "z";

  llvm_unreachable("dimension ID overflow");
}

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
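/// For example, a rank-3 memref yields three copy loops over dimensions
/// (d0, d1, d2) that get mapped to threads (z, y, x) respectively, so
/// consecutive threads along x copy consecutive elements of the innermost
/// dimension.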
static void insertCopyLoops(OpBuilder &builder, Location loc,
                            edsc::MemRefView &bounds, Value from, Value to) {
  // Create EDSC handles for bounds.
  unsigned rank = bounds.rank();
  SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps;

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    edsc::ValueHandle zero = edsc::intrinsics::constant_index(0);
    edsc::ValueHandle one = edsc::intrinsics::constant_index(1);
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
  ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());

  // Emit constant operations for steps.
  steps.reserve(lbs.size());
  llvm::transform(
      bounds.getSteps(), std::back_inserter(steps),
      [](int64_t step) { return edsc::intrinsics::constant_index(step); });

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = builder.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (unsigned i = 0; i < 3; ++i) {
    auto dimName = builder.getStringAttr(getDimName(i));
    threadIds.push_back(
        builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
    blockDims.push_back(
        builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
  }

  // Produce the loop nest with copies.
  auto ivs = edsc::makeIndexHandles(lbs.size());
  auto ivPtrs =
      edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs));
  edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
    auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
    edsc::StdIndexedValue fromHandle(from), toHandle(to);
    toHandle(activeIvs) = fromHandle(activeIvs);
  });

  // Map the innermost loops to threads in reverse order.
  for (auto en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    auto loop = cast<loop::ForOp>(
        en.value().getValue().getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy into the designated location at
/// the beginning of the region, and back from the designated location
/// immediately before the terminator of the first block of the region. The
/// region is expected to have one block. This boils down to the following
/// structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %0 = load %from[%arg0, ..., %argN]
///           store %0, %to[%arg0, ..., %argN]
///         }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %1 = load %to[%arg0, ..., %argN]
///           store %1, %from[%arg0, ..., %argN]
///         }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate barriers
/// in the case where a value is only used by the thread that copies it. Both
/// copies are inserted unconditionally; an analysis would be required to copy
/// only live-in and live-out values when necessary. This copies the entire
/// memref pointed to by "from". In case a smaller block would be sufficient,
/// the caller can create a subview of the memref and promote it instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(has_single_element(region) &&
         "unstructured control flow not supported");

  OpBuilder builder(region.getContext());
  builder.setInsertionPointToStart(&region.front());

  edsc::ScopedContext edscContext(builder, loc);
  edsc::MemRefView fromView(from);
  insertCopyLoops(builder, loc, fromView, from, to);
  builder.create<gpu::BarrierOp>(loc);

  builder.setInsertionPoint(&region.front().back());
  builder.create<gpu::BarrierOp>(loc);
  insertCopyLoops(builder, loc, fromView, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and at the end of the function.
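///
/// As an illustrative sketch (shapes chosen arbitrarily, not taken from an
/// actual test), promoting argument 0 of
///
///   gpu.func @f(%arg0: memref<4xf32>) { ... }
///
/// roughly produces
///
///   gpu.func @f(%arg0: memref<4xf32>)
///       workgroup(%promoted: memref<4xf32, 3>) {
///     <copy %arg0 into %promoted>
///     gpu.barrier
///     <original body, with uses of %arg0 replaced by %promoted>
///     gpu.barrier
///     <copy %promoted back into %arg0>
///   }
///
/// where 3 denotes the GPU dialect's workgroup address space.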
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  Value attribution =
      op.addWorkgroupAttribution(type.getShape(), type.getElementType());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
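
// Illustrative usage sketch (not part of the original file): a helper that
// promotes every statically shaped memref argument of a gpu.func. The name
// `promoteAllMemRefArguments` is hypothetical; only `promoteToWorkgroupMemory`
// above is provided by this file.
static void promoteAllMemRefArguments(GPUFuncOp op) {
  for (unsigned i = 0, e = op.getNumArguments(); i != e; ++i) {
    // promoteToWorkgroupMemory asserts on non-memref and dynamically shaped
    // arguments, so skip those here.
    auto type = op.getArgument(i).getType().dyn_cast<MemRefType>();
    if (!type || !type.hasStaticShape())
      continue;
    promoteToWorkgroupMemory(op, i);
  }
}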