summary | refs | log | tree | commit | diff | stats
path: root/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp')
-rw-r--r-- mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp 111
1 files changed, 57 insertions, 54 deletions
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
index d663ae105f2..3cbce7caa76 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -43,7 +43,7 @@ using namespace mlir::loop;
using llvm::seq;
// Extract an indexed value from KernelDim3.
-static Value *getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
+static ValuePtr getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
switch (pos) {
case 0:
return dim3.x;
@@ -61,8 +61,8 @@ static Value *getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
return forOp.getLowerBoundOperands();
}
-static SmallVector<Value *, 1> getLowerBoundOperands(ForOp forOp) {
- SmallVector<Value *, 1> bounds(1, forOp.lowerBound());
+static SmallVector<ValuePtr, 1> getLowerBoundOperands(ForOp forOp) {
+ SmallVector<ValuePtr, 1> bounds(1, forOp.lowerBound());
return bounds;
}
@@ -70,33 +70,35 @@ static SmallVector<Value *, 1> getLowerBoundOperands(ForOp forOp) {
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
return forOp.getUpperBoundOperands();
}
-static SmallVector<Value *, 1> getUpperBoundOperands(ForOp forOp) {
- SmallVector<Value *, 1> bounds(1, forOp.upperBound());
+static SmallVector<ValuePtr, 1> getUpperBoundOperands(ForOp forOp) {
+ SmallVector<ValuePtr, 1> bounds(1, forOp.upperBound());
return bounds;
}
// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
-static Value *getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
+static ValuePtr getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep());
}
-static Value *getOrCreateStep(ForOp forOp, OpBuilder &) { return forOp.step(); }
+static ValuePtr getOrCreateStep(ForOp forOp, OpBuilder &) {
+ return forOp.step();
+}
// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
-static Value *getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
+static ValuePtr getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
return lowerAffineLowerBound(forOp, builder);
}
-static Value *getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
+static ValuePtr getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
return forOp.lowerBound();
}
// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
-static Value *getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
+static ValuePtr getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
return lowerAffineUpperBound(forOp, builder);
}
-static Value *getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
+static ValuePtr getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
return forOp.upperBound();
}
@@ -212,18 +214,18 @@ struct LoopToGpuConverter {
unsigned numThreadDims);
// Ranges of the loops mapped to blocks or threads.
- SmallVector<Value *, 6> dims;
+ SmallVector<ValuePtr, 6> dims;
// Lower bounds of the loops mapped to blocks or threads.
- SmallVector<Value *, 6> lbs;
+ SmallVector<ValuePtr, 6> lbs;
// Induction variables of the loops mapped to blocks or threads.
- SmallVector<Value *, 6> ivs;
+ SmallVector<ValuePtr, 6> ivs;
// Steps of the loops mapped to blocks or threads.
- SmallVector<Value *, 6> steps;
+ SmallVector<ValuePtr, 6> steps;
};
} // namespace
// Return true if the value is obviously a constant "one".
-static bool isConstantOne(Value *value) {
+static bool isConstantOne(ValuePtr value) {
if (auto def = dyn_cast_or_null<ConstantIndexOp>(value->getDefiningOp()))
return def.getValue() == 1;
return false;
@@ -244,15 +246,15 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
steps.reserve(numLoops);
OpTy currentLoop = forOp;
for (unsigned i = 0; i < numLoops; ++i) {
- Value *lowerBound = getOrEmitLowerBound(currentLoop, builder);
- Value *upperBound = getOrEmitUpperBound(currentLoop, builder);
+ ValuePtr lowerBound = getOrEmitLowerBound(currentLoop, builder);
+ ValuePtr upperBound = getOrEmitUpperBound(currentLoop, builder);
if (!lowerBound || !upperBound) {
return llvm::None;
}
- Value *range =
+ ValuePtr range =
builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
- Value *step = getOrCreateStep(currentLoop, builder);
+ ValuePtr step = getOrCreateStep(currentLoop, builder);
if (!isConstantOne(step))
range = builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step);
dims.push_back(range);
@@ -274,8 +276,8 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
/// `nids`. The innermost loop is mapped to the x-dimension, followed by the
/// next innermost loop to y-dimension, followed by z-dimension.
template <typename OpTy>
-OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value *> ids,
- ArrayRef<Value *> nids) {
+OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<ValuePtr> ids,
+ ArrayRef<ValuePtr> nids) {
auto nDims = ids.size();
assert(nDims == nids.size());
for (auto dim : llvm::seq<unsigned>(0, nDims)) {
@@ -295,11 +297,11 @@ OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value *> ids,
/// each workgroup/workitem and number of workgroup/workitems along a dimension
/// of the launch into a container.
void packIdAndNumId(gpu::KernelDim3 kernelIds, gpu::KernelDim3 kernelNids,
- unsigned nDims, SmallVectorImpl<Value *> &ids,
- SmallVectorImpl<Value *> &nids) {
+ unsigned nDims, SmallVectorImpl<ValuePtr> &ids,
+ SmallVectorImpl<ValuePtr> &nids) {
assert(nDims <= 3 && "invalid number of launch dimensions");
- SmallVector<Value *, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
- SmallVector<Value *, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
+ SmallVector<ValuePtr, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
+ SmallVector<ValuePtr, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
ids.clear();
ids.append(std::next(allIds.begin(), allIds.size() - nDims), allIds.end());
nids.clear();
@@ -317,7 +319,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
auto returnOp = builder.create<gpu::ReturnOp>(launchOp.getLoc());
rootForOp.getOperation()->moveBefore(returnOp);
- SmallVector<Value *, 3> workgroupID, numWorkGroups;
+ SmallVector<ValuePtr, 3> workgroupID, numWorkGroups;
packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims,
workgroupID, numWorkGroups);
@@ -333,7 +335,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
}
}
- SmallVector<Value *, 3> workItemID, workGroupSize;
+ SmallVector<ValuePtr, 3> workItemID, workGroupSize;
packIdAndNumId(launchOp.getThreadIds(), launchOp.getBlockSize(),
numThreadDims, workItemID, workGroupSize);
for (auto &loopOp : threadRootForOps) {
@@ -347,17 +349,17 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
// given workgroup size and number of workgroups.
template <typename OpTy>
LogicalResult createLaunchFromOp(OpTy rootForOp,
- ArrayRef<Value *> numWorkGroups,
- ArrayRef<Value *> workGroupSizes) {
+ ArrayRef<ValuePtr> numWorkGroups,
+ ArrayRef<ValuePtr> workGroupSizes) {
OpBuilder builder(rootForOp.getOperation());
if (numWorkGroups.size() > 3) {
return rootForOp.emitError("invalid ")
<< numWorkGroups.size() << "-D workgroup specification";
}
auto loc = rootForOp.getLoc();
- Value *one = builder.create<ConstantOp>(
+ ValuePtr one = builder.create<ConstantOp>(
loc, builder.getIntegerAttr(builder.getIndexType(), 1));
- SmallVector<Value *, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
+ SmallVector<ValuePtr, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
for (auto numWorkGroup : enumerate(numWorkGroups)) {
numWorkGroups3D[numWorkGroup.index()] = numWorkGroup.value();
}
@@ -367,7 +369,7 @@ LogicalResult createLaunchFromOp(OpTy rootForOp,
// Get the values used within the region of the rootForOp but defined above
// it.
- llvm::SetVector<Value *> valuesToForwardSet;
+ llvm::SetVector<ValuePtr> valuesToForwardSet;
getUsedValuesDefinedAbove(rootForOp.region(), rootForOp.region(),
valuesToForwardSet);
// Also add the values used for the lb, ub, and step of the rootForOp.
@@ -387,8 +389,8 @@ LogicalResult createLaunchFromOp(OpTy rootForOp,
// defined outside. They all are replaced with kernel arguments.
for (const auto &pair :
llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
- Value *from = std::get<0>(pair);
- Value *to = std::get<1>(pair);
+ ValuePtr from = std::get<0>(pair);
+ ValuePtr to = std::get<1>(pair);
replaceAllUsesInRegionWith(from, to, launchOp.body());
}
return success();
@@ -408,22 +410,23 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
OpBuilder builder(rootForOp.getOperation());
// Prepare the grid and block sizes for the launch operation. If there is
// no loop mapped to a specific dimension, use constant "1" as its size.
- Value *constOne = (numBlockDims < 3 || numThreadDims < 3)
- ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
- : nullptr;
- Value *gridSizeX = dims[0];
- Value *gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
- Value *gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
- Value *blockSizeX = dims[numBlockDims];
- Value *blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
- Value *blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
+ ValuePtr constOne =
+ (numBlockDims < 3 || numThreadDims < 3)
+ ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
+ : nullptr;
+ ValuePtr gridSizeX = dims[0];
+ ValuePtr gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
+ ValuePtr gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
+ ValuePtr blockSizeX = dims[numBlockDims];
+ ValuePtr blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
+ ValuePtr blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
// Create a launch op and move the body region of the innermost loop to the
// launch op. Pass the values defined outside the outermost loop and used
// inside the innermost loop and loop lower bounds as kernel data arguments.
// Still assuming perfect nesting so there are no values other than induction
// variables that are defined in one loop and used in deeper loops.
- llvm::SetVector<Value *> valuesToForwardSet;
+ llvm::SetVector<ValuePtr> valuesToForwardSet;
getUsedValuesDefinedAbove(innermostForOp.region(), rootForOp.region(),
valuesToForwardSet);
auto valuesToForward = valuesToForwardSet.takeVector();
@@ -457,15 +460,15 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
originallyForwardedValues);
auto stepArgumentIt = std::next(lbArgumentIt, lbs.size());
for (auto en : llvm::enumerate(ivs)) {
- Value *id =
+ ValuePtr id =
en.index() < numBlockDims
? getDim3Value(launchOp.getBlockIds(), en.index())
: getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
- Value *step = steps[en.index()];
+ ValuePtr step = steps[en.index()];
if (!isConstantOne(step))
id = builder.create<MulIOp>(rootForOp.getLoc(), step, id);
- Value *ivReplacement =
+ ValuePtr ivReplacement =
builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
en.value()->replaceAllUsesWith(ivReplacement);
replaceAllUsesInRegionWith(steps[en.index()], *stepArgumentIt,
@@ -479,8 +482,8 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
// trailing positions, make sure we don't touch those.
for (const auto &pair :
llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
- Value *from = std::get<0>(pair);
- Value *to = std::get<1>(pair);
+ ValuePtr from = std::get<0>(pair);
+ ValuePtr to = std::get<1>(pair);
replaceAllUsesInRegionWith(from, to, launchOp.body());
}
@@ -510,8 +513,8 @@ static LogicalResult convertLoopNestToGPULaunch(OpTy forOp,
// nested. The workgroup size and number of workgroups are provided as input.
template <typename OpTy>
static LogicalResult convertLoopToGPULaunch(OpTy forOp,
- ArrayRef<Value *> numWorkGroups,
- ArrayRef<Value *> workGroupSize) {
+ ArrayRef<ValuePtr> numWorkGroups,
+ ArrayRef<ValuePtr> workGroupSize) {
if (failed(checkLoopOpMappable(forOp, numWorkGroups.size(),
workGroupSize.size()))) {
return failure();
@@ -532,7 +535,7 @@ LogicalResult mlir::convertLoopNestToGPULaunch(ForOp forOp,
}
LogicalResult mlir::convertLoopToGPULaunch(loop::ForOp forOp,
- ArrayRef<Value *> numWorkGroups,
- ArrayRef<Value *> workGroupSizes) {
+ ArrayRef<ValuePtr> numWorkGroups,
+ ArrayRef<ValuePtr> workGroupSizes) {
return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
}
OpenPOWER on IntegriCloud