Diffstat (limited to 'mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp')
-rw-r--r--  mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp | 112
1 file changed, 54 insertions(+), 58 deletions(-)
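The change below is part of MLIR's migration from the ValuePtr alias to the
value-semantic Value class: every ValuePtr in this pass becomes a Value that is
stored and passed by value (SmallVector<Value, N>, ArrayRef<Value>) rather than
through a pointer-like alias. A minimal sketch of the pattern, modeled on the
patch's own isConstantOne; the helper name isIndexConstant is hypothetical and
not part of the patch:

// Hypothetical helper illustrating the value-semantic handle. Value still
// converts to bool and still reaches the IR object via operator-> at this
// revision, so only the parameter type changes relative to ValuePtr.
static bool isIndexConstant(Value value, int64_t expected) {
  // A default-constructed Value is null and tests false.
  if (!value)
    return false;
  if (auto def = dyn_cast_or_null<ConstantIndexOp>(value->getDefiningOp()))
    return def.getValue() == expected;
  return false;
}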
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
index 24bb8ffc462..e500d10983c 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -34,7 +34,7 @@ using namespace mlir::loop;
using llvm::seq;
// Extract an indexed value from KernelDim3.
-static ValuePtr getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
+static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
switch (pos) {
case 0:
return dim3.x;
@@ -52,8 +52,8 @@ static ValuePtr getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
return forOp.getLowerBoundOperands();
}
-static SmallVector<ValuePtr, 1> getLowerBoundOperands(ForOp forOp) {
- SmallVector<ValuePtr, 1> bounds(1, forOp.lowerBound());
+static SmallVector<Value, 1> getLowerBoundOperands(ForOp forOp) {
+ SmallVector<Value, 1> bounds(1, forOp.lowerBound());
return bounds;
}
@@ -61,35 +61,33 @@ static SmallVector<ValuePtr, 1> getLowerBoundOperands(ForOp forOp) {
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
return forOp.getUpperBoundOperands();
}
-static SmallVector<ValuePtr, 1> getUpperBoundOperands(ForOp forOp) {
- SmallVector<ValuePtr, 1> bounds(1, forOp.upperBound());
+static SmallVector<Value, 1> getUpperBoundOperands(ForOp forOp) {
+ SmallVector<Value, 1> bounds(1, forOp.upperBound());
return bounds;
}
// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
-static ValuePtr getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
+static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep());
}
-static ValuePtr getOrCreateStep(ForOp forOp, OpBuilder &) {
- return forOp.step();
-}
+static Value getOrCreateStep(ForOp forOp, OpBuilder &) { return forOp.step(); }
// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
-static ValuePtr getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
+static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
return lowerAffineLowerBound(forOp, builder);
}
-static ValuePtr getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
+static Value getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
return forOp.lowerBound();
}
// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
-static ValuePtr getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
+static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
return lowerAffineUpperBound(forOp, builder);
}
-static ValuePtr getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
+static Value getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
return forOp.upperBound();
}
@@ -205,18 +203,18 @@ struct LoopToGpuConverter {
unsigned numThreadDims);
// Ranges of the loops mapped to blocks or threads.
- SmallVector<ValuePtr, 6> dims;
+ SmallVector<Value, 6> dims;
// Lower bounds of the loops mapped to blocks or threads.
- SmallVector<ValuePtr, 6> lbs;
+ SmallVector<Value, 6> lbs;
// Induction variables of the loops mapped to blocks or threads.
- SmallVector<ValuePtr, 6> ivs;
+ SmallVector<Value, 6> ivs;
// Steps of the loops mapped to blocks or threads.
- SmallVector<ValuePtr, 6> steps;
+ SmallVector<Value, 6> steps;
};
} // namespace
// Return true if the value is obviously a constant "one".
-static bool isConstantOne(ValuePtr value) {
+static bool isConstantOne(Value value) {
if (auto def = dyn_cast_or_null<ConstantIndexOp>(value->getDefiningOp()))
return def.getValue() == 1;
return false;
@@ -237,15 +235,15 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
steps.reserve(numLoops);
OpTy currentLoop = forOp;
for (unsigned i = 0; i < numLoops; ++i) {
- ValuePtr lowerBound = getOrEmitLowerBound(currentLoop, builder);
- ValuePtr upperBound = getOrEmitUpperBound(currentLoop, builder);
+ Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
+ Value upperBound = getOrEmitUpperBound(currentLoop, builder);
if (!lowerBound || !upperBound) {
return llvm::None;
}
- ValuePtr range =
+ Value range =
builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
- ValuePtr step = getOrCreateStep(currentLoop, builder);
+ Value step = getOrCreateStep(currentLoop, builder);
if (!isConstantOne(step))
range = builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step);
dims.push_back(range);
@@ -267,8 +265,8 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
/// `nids`. The innermost loop is mapped to the x-dimension, followed by the
/// next innermost loop to y-dimension, followed by z-dimension.
template <typename OpTy>
-OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<ValuePtr> ids,
- ArrayRef<ValuePtr> nids) {
+OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value> ids,
+ ArrayRef<Value> nids) {
auto nDims = ids.size();
assert(nDims == nids.size());
for (auto dim : llvm::seq<unsigned>(0, nDims)) {
@@ -288,11 +286,11 @@ OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<ValuePtr> ids,
/// each workgroup/workitem and number of workgroup/workitems along a dimension
/// of the launch into a container.
void packIdAndNumId(gpu::KernelDim3 kernelIds, gpu::KernelDim3 kernelNids,
- unsigned nDims, SmallVectorImpl<ValuePtr> &ids,
- SmallVectorImpl<ValuePtr> &nids) {
+ unsigned nDims, SmallVectorImpl<Value> &ids,
+ SmallVectorImpl<Value> &nids) {
assert(nDims <= 3 && "invalid number of launch dimensions");
- SmallVector<ValuePtr, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
- SmallVector<ValuePtr, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
+ SmallVector<Value, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
+ SmallVector<Value, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
ids.clear();
ids.append(std::next(allIds.begin(), allIds.size() - nDims), allIds.end());
nids.clear();
@@ -310,7 +308,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
auto returnOp = builder.create<gpu::ReturnOp>(launchOp.getLoc());
rootForOp.getOperation()->moveBefore(returnOp);
- SmallVector<ValuePtr, 3> workgroupID, numWorkGroups;
+ SmallVector<Value, 3> workgroupID, numWorkGroups;
packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims,
workgroupID, numWorkGroups);
@@ -326,7 +324,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
}
}
- SmallVector<ValuePtr, 3> workItemID, workGroupSize;
+ SmallVector<Value, 3> workItemID, workGroupSize;
packIdAndNumId(launchOp.getThreadIds(), launchOp.getBlockSize(),
numThreadDims, workItemID, workGroupSize);
for (auto &loopOp : threadRootForOps) {
@@ -339,18 +337,17 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
// Convert the computation rooted at the `rootForOp`, into a GPU kernel with the
// given workgroup size and number of workgroups.
template <typename OpTy>
-LogicalResult createLaunchFromOp(OpTy rootForOp,
- ArrayRef<ValuePtr> numWorkGroups,
- ArrayRef<ValuePtr> workGroupSizes) {
+LogicalResult createLaunchFromOp(OpTy rootForOp, ArrayRef<Value> numWorkGroups,
+ ArrayRef<Value> workGroupSizes) {
OpBuilder builder(rootForOp.getOperation());
if (numWorkGroups.size() > 3) {
return rootForOp.emitError("invalid ")
<< numWorkGroups.size() << "-D workgroup specification";
}
auto loc = rootForOp.getLoc();
- ValuePtr one = builder.create<ConstantOp>(
+ Value one = builder.create<ConstantOp>(
loc, builder.getIntegerAttr(builder.getIndexType(), 1));
- SmallVector<ValuePtr, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
+ SmallVector<Value, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
for (auto numWorkGroup : enumerate(numWorkGroups)) {
numWorkGroups3D[numWorkGroup.index()] = numWorkGroup.value();
}
@@ -360,7 +357,7 @@ LogicalResult createLaunchFromOp(OpTy rootForOp,
// Get the values used within the region of the rootForOp but defined above
// it.
- llvm::SetVector<ValuePtr> valuesToForwardSet;
+ llvm::SetVector<Value> valuesToForwardSet;
getUsedValuesDefinedAbove(rootForOp.region(), rootForOp.region(),
valuesToForwardSet);
// Also add the values used for the lb, ub, and step of the rootForOp.
@@ -380,8 +377,8 @@ LogicalResult createLaunchFromOp(OpTy rootForOp,
// defined outside. They all are replaced with kernel arguments.
for (const auto &pair :
llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
- ValuePtr from = std::get<0>(pair);
- ValuePtr to = std::get<1>(pair);
+ Value from = std::get<0>(pair);
+ Value to = std::get<1>(pair);
replaceAllUsesInRegionWith(from, to, launchOp.body());
}
return success();
@@ -401,23 +398,22 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
OpBuilder builder(rootForOp.getOperation());
// Prepare the grid and block sizes for the launch operation. If there is
// no loop mapped to a specific dimension, use constant "1" as its size.
- ValuePtr constOne =
- (numBlockDims < 3 || numThreadDims < 3)
- ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
- : nullptr;
- ValuePtr gridSizeX = dims[0];
- ValuePtr gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
- ValuePtr gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
- ValuePtr blockSizeX = dims[numBlockDims];
- ValuePtr blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
- ValuePtr blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
+ Value constOne = (numBlockDims < 3 || numThreadDims < 3)
+ ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
+ : nullptr;
+ Value gridSizeX = dims[0];
+ Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
+ Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
+ Value blockSizeX = dims[numBlockDims];
+ Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
+ Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
// Create a launch op and move the body region of the innermost loop to the
// launch op. Pass the values defined outside the outermost loop and used
// inside the innermost loop and loop lower bounds as kernel data arguments.
// Still assuming perfect nesting so there are no values other than induction
// variables that are defined in one loop and used in deeper loops.
- llvm::SetVector<ValuePtr> valuesToForwardSet;
+ llvm::SetVector<Value> valuesToForwardSet;
getUsedValuesDefinedAbove(innermostForOp.region(), rootForOp.region(),
valuesToForwardSet);
auto valuesToForward = valuesToForwardSet.takeVector();
@@ -451,15 +447,15 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
originallyForwardedValues);
auto stepArgumentIt = std::next(lbArgumentIt, lbs.size());
for (auto en : llvm::enumerate(ivs)) {
- ValuePtr id =
+ Value id =
en.index() < numBlockDims
? getDim3Value(launchOp.getBlockIds(), en.index())
: getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
- ValuePtr step = steps[en.index()];
+ Value step = steps[en.index()];
if (!isConstantOne(step))
id = builder.create<MulIOp>(rootForOp.getLoc(), step, id);
- ValuePtr ivReplacement =
+ Value ivReplacement =
builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
en.value()->replaceAllUsesWith(ivReplacement);
replaceAllUsesInRegionWith(steps[en.index()], *stepArgumentIt,
@@ -473,8 +469,8 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
// trailing positions, make sure we don't touch those.
for (const auto &pair :
llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
- ValuePtr from = std::get<0>(pair);
- ValuePtr to = std::get<1>(pair);
+ Value from = std::get<0>(pair);
+ Value to = std::get<1>(pair);
replaceAllUsesInRegionWith(from, to, launchOp.body());
}
@@ -504,8 +500,8 @@ static LogicalResult convertLoopNestToGPULaunch(OpTy forOp,
// nested. The workgroup size and num workgroups is provided as input
template <typename OpTy>
static LogicalResult convertLoopToGPULaunch(OpTy forOp,
- ArrayRef<ValuePtr> numWorkGroups,
- ArrayRef<ValuePtr> workGroupSize) {
+ ArrayRef<Value> numWorkGroups,
+ ArrayRef<Value> workGroupSize) {
if (failed(checkLoopOpMappable(forOp, numWorkGroups.size(),
workGroupSize.size()))) {
return failure();
@@ -526,7 +522,7 @@ LogicalResult mlir::convertLoopNestToGPULaunch(ForOp forOp,
}
LogicalResult mlir::convertLoopToGPULaunch(loop::ForOp forOp,
- ArrayRef<ValuePtr> numWorkGroups,
- ArrayRef<ValuePtr> workGroupSizes) {
+ ArrayRef<Value> numWorkGroups,
+ ArrayRef<Value> workGroupSizes) {
return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
}
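Callers of the public entry points now pass ArrayRef<Value> instead of
ArrayRef<ValuePtr>. A hedged call-site sketch under the new signature; the
wrapper name mapLoopTo1DLaunch and the constant launch bounds are assumptions
for illustration, not part of the patch:

// Hypothetical call site: build 1-D launch bounds as Values and invoke the
// updated API, which now takes ArrayRef<Value>.
static LogicalResult mapLoopTo1DLaunch(loop::ForOp forOp) {
  OpBuilder builder(forOp.getOperation());
  Location loc = forOp.getLoc();
  // One workgroup dimension and one workitem dimension (x only).
  Value numGroups = builder.create<ConstantIndexOp>(loc, 128);
  Value groupSize = builder.create<ConstantIndexOp>(loc, 64);
  return mlir::convertLoopToGPULaunch(forOp, {numGroups}, {groupSize});
}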