diff options
Diffstat (limited to 'mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp')
-rw-r--r-- | mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp | 111 |
1 files changed, 57 insertions, 54 deletions
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp index d663ae105f2..3cbce7caa76 100644 --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -43,7 +43,7 @@ using namespace mlir::loop; using llvm::seq; // Extract an indexed value from KernelDim3. -static Value *getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) { +static ValuePtr getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) { switch (pos) { case 0: return dim3.x; @@ -61,8 +61,8 @@ static Value *getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) { static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) { return forOp.getLowerBoundOperands(); } -static SmallVector<Value *, 1> getLowerBoundOperands(ForOp forOp) { - SmallVector<Value *, 1> bounds(1, forOp.lowerBound()); +static SmallVector<ValuePtr, 1> getLowerBoundOperands(ForOp forOp) { + SmallVector<ValuePtr, 1> bounds(1, forOp.lowerBound()); return bounds; } @@ -70,33 +70,35 @@ static SmallVector<Value *, 1> getLowerBoundOperands(ForOp forOp) { static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) { return forOp.getUpperBoundOperands(); } -static SmallVector<Value *, 1> getUpperBoundOperands(ForOp forOp) { - SmallVector<Value *, 1> bounds(1, forOp.upperBound()); +static SmallVector<ValuePtr, 1> getUpperBoundOperands(ForOp forOp) { + SmallVector<ValuePtr, 1> bounds(1, forOp.upperBound()); return bounds; } // Get a Value that corresponds to the loop step. If the step is an attribute, // materialize a corresponding constant using builder. -static Value *getOrCreateStep(AffineForOp forOp, OpBuilder &builder) { +static ValuePtr getOrCreateStep(AffineForOp forOp, OpBuilder &builder) { return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep()); } -static Value *getOrCreateStep(ForOp forOp, OpBuilder &) { return forOp.step(); } +static ValuePtr getOrCreateStep(ForOp forOp, OpBuilder &) { + return forOp.step(); +} // Get a Value for the loop lower bound. If the value requires computation, // materialize the instructions using builder. -static Value *getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) { +static ValuePtr getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) { return lowerAffineLowerBound(forOp, builder); } -static Value *getOrEmitLowerBound(ForOp forOp, OpBuilder &) { +static ValuePtr getOrEmitLowerBound(ForOp forOp, OpBuilder &) { return forOp.lowerBound(); } // Get a Value for the loop upper bound. If the value requires computation, // materialize the instructions using builder. -static Value *getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) { +static ValuePtr getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) { return lowerAffineUpperBound(forOp, builder); } -static Value *getOrEmitUpperBound(ForOp forOp, OpBuilder &) { +static ValuePtr getOrEmitUpperBound(ForOp forOp, OpBuilder &) { return forOp.upperBound(); } @@ -212,18 +214,18 @@ struct LoopToGpuConverter { unsigned numThreadDims); // Ranges of the loops mapped to blocks or threads. - SmallVector<Value *, 6> dims; + SmallVector<ValuePtr, 6> dims; // Lower bounds of the loops mapped to blocks or threads. - SmallVector<Value *, 6> lbs; + SmallVector<ValuePtr, 6> lbs; // Induction variables of the loops mapped to blocks or threads. - SmallVector<Value *, 6> ivs; + SmallVector<ValuePtr, 6> ivs; // Steps of the loops mapped to blocks or threads. - SmallVector<Value *, 6> steps; + SmallVector<ValuePtr, 6> steps; }; } // namespace // Return true if the value is obviously a constant "one". -static bool isConstantOne(Value *value) { +static bool isConstantOne(ValuePtr value) { if (auto def = dyn_cast_or_null<ConstantIndexOp>(value->getDefiningOp())) return def.getValue() == 1; return false; @@ -244,15 +246,15 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp, steps.reserve(numLoops); OpTy currentLoop = forOp; for (unsigned i = 0; i < numLoops; ++i) { - Value *lowerBound = getOrEmitLowerBound(currentLoop, builder); - Value *upperBound = getOrEmitUpperBound(currentLoop, builder); + ValuePtr lowerBound = getOrEmitLowerBound(currentLoop, builder); + ValuePtr upperBound = getOrEmitUpperBound(currentLoop, builder); if (!lowerBound || !upperBound) { return llvm::None; } - Value *range = + ValuePtr range = builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound); - Value *step = getOrCreateStep(currentLoop, builder); + ValuePtr step = getOrCreateStep(currentLoop, builder); if (!isConstantOne(step)) range = builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step); dims.push_back(range); @@ -274,8 +276,8 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp, /// `nids`. The innermost loop is mapped to the x-dimension, followed by the /// next innermost loop to y-dimension, followed by z-dimension. template <typename OpTy> -OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value *> ids, - ArrayRef<Value *> nids) { +OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<ValuePtr> ids, + ArrayRef<ValuePtr> nids) { auto nDims = ids.size(); assert(nDims == nids.size()); for (auto dim : llvm::seq<unsigned>(0, nDims)) { @@ -295,11 +297,11 @@ OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value *> ids, /// each workgroup/workitem and number of workgroup/workitems along a dimension /// of the launch into a container. void packIdAndNumId(gpu::KernelDim3 kernelIds, gpu::KernelDim3 kernelNids, - unsigned nDims, SmallVectorImpl<Value *> &ids, - SmallVectorImpl<Value *> &nids) { + unsigned nDims, SmallVectorImpl<ValuePtr> &ids, + SmallVectorImpl<ValuePtr> &nids) { assert(nDims <= 3 && "invalid number of launch dimensions"); - SmallVector<Value *, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x}; - SmallVector<Value *, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x}; + SmallVector<ValuePtr, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x}; + SmallVector<ValuePtr, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x}; ids.clear(); ids.append(std::next(allIds.begin(), allIds.size() - nDims), allIds.end()); nids.clear(); @@ -317,7 +319,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp, auto returnOp = builder.create<gpu::ReturnOp>(launchOp.getLoc()); rootForOp.getOperation()->moveBefore(returnOp); - SmallVector<Value *, 3> workgroupID, numWorkGroups; + SmallVector<ValuePtr, 3> workgroupID, numWorkGroups; packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims, workgroupID, numWorkGroups); @@ -333,7 +335,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp, } } - SmallVector<Value *, 3> workItemID, workGroupSize; + SmallVector<ValuePtr, 3> workItemID, workGroupSize; packIdAndNumId(launchOp.getThreadIds(), launchOp.getBlockSize(), numThreadDims, workItemID, workGroupSize); for (auto &loopOp : threadRootForOps) { @@ -347,17 +349,17 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp, // given workgroup size and number of workgroups. template <typename OpTy> LogicalResult createLaunchFromOp(OpTy rootForOp, - ArrayRef<Value *> numWorkGroups, - ArrayRef<Value *> workGroupSizes) { + ArrayRef<ValuePtr> numWorkGroups, + ArrayRef<ValuePtr> workGroupSizes) { OpBuilder builder(rootForOp.getOperation()); if (numWorkGroups.size() > 3) { return rootForOp.emitError("invalid ") << numWorkGroups.size() << "-D workgroup specification"; } auto loc = rootForOp.getLoc(); - Value *one = builder.create<ConstantOp>( + ValuePtr one = builder.create<ConstantOp>( loc, builder.getIntegerAttr(builder.getIndexType(), 1)); - SmallVector<Value *, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one); + SmallVector<ValuePtr, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one); for (auto numWorkGroup : enumerate(numWorkGroups)) { numWorkGroups3D[numWorkGroup.index()] = numWorkGroup.value(); } @@ -367,7 +369,7 @@ LogicalResult createLaunchFromOp(OpTy rootForOp, // Get the values used within the region of the rootForOp but defined above // it. - llvm::SetVector<Value *> valuesToForwardSet; + llvm::SetVector<ValuePtr> valuesToForwardSet; getUsedValuesDefinedAbove(rootForOp.region(), rootForOp.region(), valuesToForwardSet); // Also add the values used for the lb, ub, and step of the rootForOp. @@ -387,8 +389,8 @@ LogicalResult createLaunchFromOp(OpTy rootForOp, // defined outside. They all are replaced with kernel arguments. for (const auto &pair : llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) { - Value *from = std::get<0>(pair); - Value *to = std::get<1>(pair); + ValuePtr from = std::get<0>(pair); + ValuePtr to = std::get<1>(pair); replaceAllUsesInRegionWith(from, to, launchOp.body()); } return success(); @@ -408,22 +410,23 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp, OpBuilder builder(rootForOp.getOperation()); // Prepare the grid and block sizes for the launch operation. If there is // no loop mapped to a specific dimension, use constant "1" as its size. - Value *constOne = (numBlockDims < 3 || numThreadDims < 3) - ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1) - : nullptr; - Value *gridSizeX = dims[0]; - Value *gridSizeY = numBlockDims > 1 ? dims[1] : constOne; - Value *gridSizeZ = numBlockDims > 2 ? dims[2] : constOne; - Value *blockSizeX = dims[numBlockDims]; - Value *blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne; - Value *blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne; + ValuePtr constOne = + (numBlockDims < 3 || numThreadDims < 3) + ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1) + : nullptr; + ValuePtr gridSizeX = dims[0]; + ValuePtr gridSizeY = numBlockDims > 1 ? dims[1] : constOne; + ValuePtr gridSizeZ = numBlockDims > 2 ? dims[2] : constOne; + ValuePtr blockSizeX = dims[numBlockDims]; + ValuePtr blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne; + ValuePtr blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne; // Create a launch op and move the body region of the innermost loop to the // launch op. Pass the values defined outside the outermost loop and used // inside the innermost loop and loop lower bounds as kernel data arguments. // Still assuming perfect nesting so there are no values other than induction // variables that are defined in one loop and used in deeper loops. - llvm::SetVector<Value *> valuesToForwardSet; + llvm::SetVector<ValuePtr> valuesToForwardSet; getUsedValuesDefinedAbove(innermostForOp.region(), rootForOp.region(), valuesToForwardSet); auto valuesToForward = valuesToForwardSet.takeVector(); @@ -457,15 +460,15 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp, originallyForwardedValues); auto stepArgumentIt = std::next(lbArgumentIt, lbs.size()); for (auto en : llvm::enumerate(ivs)) { - Value *id = + ValuePtr id = en.index() < numBlockDims ? getDim3Value(launchOp.getBlockIds(), en.index()) : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims); - Value *step = steps[en.index()]; + ValuePtr step = steps[en.index()]; if (!isConstantOne(step)) id = builder.create<MulIOp>(rootForOp.getLoc(), step, id); - Value *ivReplacement = + ValuePtr ivReplacement = builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id); en.value()->replaceAllUsesWith(ivReplacement); replaceAllUsesInRegionWith(steps[en.index()], *stepArgumentIt, @@ -479,8 +482,8 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp, // trailing positions, make sure we don't touch those. for (const auto &pair : llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) { - Value *from = std::get<0>(pair); - Value *to = std::get<1>(pair); + ValuePtr from = std::get<0>(pair); + ValuePtr to = std::get<1>(pair); replaceAllUsesInRegionWith(from, to, launchOp.body()); } @@ -510,8 +513,8 @@ static LogicalResult convertLoopNestToGPULaunch(OpTy forOp, // nested. The workgroup size and num workgroups is provided as input template <typename OpTy> static LogicalResult convertLoopToGPULaunch(OpTy forOp, - ArrayRef<Value *> numWorkGroups, - ArrayRef<Value *> workGroupSize) { + ArrayRef<ValuePtr> numWorkGroups, + ArrayRef<ValuePtr> workGroupSize) { if (failed(checkLoopOpMappable(forOp, numWorkGroups.size(), workGroupSize.size()))) { return failure(); @@ -532,7 +535,7 @@ LogicalResult mlir::convertLoopNestToGPULaunch(ForOp forOp, } LogicalResult mlir::convertLoopToGPULaunch(loop::ForOp forOp, - ArrayRef<Value *> numWorkGroups, - ArrayRef<Value *> workGroupSizes) { + ArrayRef<ValuePtr> numWorkGroups, + ArrayRef<ValuePtr> workGroupSizes) { return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes); } |