| author | River Riddle <riverriddle@google.com> | 2019-12-23 14:45:01 -0800 |
|---|---|---|
| committer | A. Unique TensorFlower <gardener@tensorflow.org> | 2019-12-23 16:36:53 -0800 |
| commit | e62a69561fb9d7b1013d2853da68d79a7907fead (patch) | |
| tree | 0dd059094cbfb8d904513abcdc1fbe8cfa89bb09 /mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp | |
| parent | 5d5bd2e1da29d976cb125dbb3cd097a5e42b2be4 (diff) | |
NFC: Replace ValuePtr with Value and remove it now that Value is value-typed.
ValuePtr was a temporary typedef during the transition to a value-typed Value.
PiperOrigin-RevId: 286945714
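To make the change concrete, here is a minimal sketch of the transition shim and of post-commit usage. The `using` alias shown is an illustration of the temporary typedef this commit removes, not a line quoted from mlir/IR/Value.h:

```cpp
// Sketch only. During the transition, pointer-style code kept compiling
// through a temporary alias (illustrative):
//   using ValuePtr = Value; // removed by this commit
//
// With Value value-typed, it is passed, copied, and stored by value:
#include "mlir/IR/Value.h"
#include "llvm/ADT/SmallVector.h"

static void collectOperand(mlir::Value v,
                           llvm::SmallVectorImpl<mlir::Value> &out) {
  out.push_back(v); // cheap copy: Value wraps a pointer-sized handle
}
```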
Diffstat (limited to 'mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp')
| -rw-r--r-- | mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp | 112 |
1 file changed, 54 insertions(+), 58 deletions(-)
```diff
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
index 24bb8ffc462..e500d10983c 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -34,7 +34,7 @@ using namespace mlir::loop;
 using llvm::seq;
 
 // Extract an indexed value from KernelDim3.
-static ValuePtr getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
+static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
   switch (pos) {
   case 0:
     return dim3.x;
@@ -52,8 +52,8 @@ static ValuePtr getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
 static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
   return forOp.getLowerBoundOperands();
 }
-static SmallVector<ValuePtr, 1> getLowerBoundOperands(ForOp forOp) {
-  SmallVector<ValuePtr, 1> bounds(1, forOp.lowerBound());
+static SmallVector<Value, 1> getLowerBoundOperands(ForOp forOp) {
+  SmallVector<Value, 1> bounds(1, forOp.lowerBound());
   return bounds;
 }
 
@@ -61,35 +61,33 @@ static SmallVector<ValuePtr, 1> getLowerBoundOperands(ForOp forOp) {
 static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
   return forOp.getUpperBoundOperands();
 }
-static SmallVector<ValuePtr, 1> getUpperBoundOperands(ForOp forOp) {
-  SmallVector<ValuePtr, 1> bounds(1, forOp.upperBound());
+static SmallVector<Value, 1> getUpperBoundOperands(ForOp forOp) {
+  SmallVector<Value, 1> bounds(1, forOp.upperBound());
   return bounds;
 }
 
 // Get a Value that corresponds to the loop step. If the step is an attribute,
 // materialize a corresponding constant using builder.
-static ValuePtr getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
+static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
   return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep());
 }
-static ValuePtr getOrCreateStep(ForOp forOp, OpBuilder &) {
-  return forOp.step();
-}
+static Value getOrCreateStep(ForOp forOp, OpBuilder &) { return forOp.step(); }
 
 // Get a Value for the loop lower bound. If the value requires computation,
 // materialize the instructions using builder.
-static ValuePtr getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
+static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
   return lowerAffineLowerBound(forOp, builder);
 }
-static ValuePtr getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
+static Value getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
   return forOp.lowerBound();
 }
 
 // Get a Value for the loop upper bound. If the value requires computation,
 // materialize the instructions using builder.
-static ValuePtr getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
+static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
   return lowerAffineUpperBound(forOp, builder);
 }
-static ValuePtr getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
+static Value getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
   return forOp.upperBound();
 }
 
@@ -205,18 +203,18 @@ struct LoopToGpuConverter {
                     unsigned numThreadDims);
 
   // Ranges of the loops mapped to blocks or threads.
-  SmallVector<ValuePtr, 6> dims;
+  SmallVector<Value, 6> dims;
   // Lower bounds of the loops mapped to blocks or threads.
-  SmallVector<ValuePtr, 6> lbs;
+  SmallVector<Value, 6> lbs;
   // Induction variables of the loops mapped to blocks or threads.
-  SmallVector<ValuePtr, 6> ivs;
+  SmallVector<Value, 6> ivs;
   // Steps of the loops mapped to blocks or threads.
-  SmallVector<ValuePtr, 6> steps;
+  SmallVector<Value, 6> steps;
 };
 } // namespace
 
 // Return true if the value is obviously a constant "one".
-static bool isConstantOne(ValuePtr value) {
+static bool isConstantOne(Value value) {
   if (auto def = dyn_cast_or_null<ConstantIndexOp>(value->getDefiningOp()))
     return def.getValue() == 1;
   return false;
 }
@@ -237,15 +235,15 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
   steps.reserve(numLoops);
   OpTy currentLoop = forOp;
   for (unsigned i = 0; i < numLoops; ++i) {
-    ValuePtr lowerBound = getOrEmitLowerBound(currentLoop, builder);
-    ValuePtr upperBound = getOrEmitUpperBound(currentLoop, builder);
+    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
+    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
     if (!lowerBound || !upperBound) {
       return llvm::None;
     }
 
-    ValuePtr range =
+    Value range =
         builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
-    ValuePtr step = getOrCreateStep(currentLoop, builder);
+    Value step = getOrCreateStep(currentLoop, builder);
     if (!isConstantOne(step))
       range = builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step);
     dims.push_back(range);
@@ -267,8 +265,8 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
 /// `nids`. The innermost loop is mapped to the x-dimension, followed by the
 /// next innermost loop to y-dimension, followed by z-dimension.
 template <typename OpTy>
-OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<ValuePtr> ids,
-                          ArrayRef<ValuePtr> nids) {
+OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value> ids,
+                          ArrayRef<Value> nids) {
   auto nDims = ids.size();
   assert(nDims == nids.size());
   for (auto dim : llvm::seq<unsigned>(0, nDims)) {
@@ -288,11 +286,11 @@ OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<ValuePtr> ids,
 /// each workgroup/workitem and number of workgroup/workitems along a dimension
 /// of the launch into a container.
 void packIdAndNumId(gpu::KernelDim3 kernelIds, gpu::KernelDim3 kernelNids,
-                    unsigned nDims, SmallVectorImpl<ValuePtr> &ids,
-                    SmallVectorImpl<ValuePtr> &nids) {
+                    unsigned nDims, SmallVectorImpl<Value> &ids,
+                    SmallVectorImpl<Value> &nids) {
   assert(nDims <= 3 && "invalid number of launch dimensions");
-  SmallVector<ValuePtr, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
-  SmallVector<ValuePtr, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
+  SmallVector<Value, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
+  SmallVector<Value, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
   ids.clear();
   ids.append(std::next(allIds.begin(), allIds.size() - nDims), allIds.end());
   nids.clear();
@@ -310,7 +308,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
   auto returnOp = builder.create<gpu::ReturnOp>(launchOp.getLoc());
   rootForOp.getOperation()->moveBefore(returnOp);
 
-  SmallVector<ValuePtr, 3> workgroupID, numWorkGroups;
+  SmallVector<Value, 3> workgroupID, numWorkGroups;
   packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims,
                  workgroupID, numWorkGroups);
 
@@ -326,7 +324,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
     }
   }
 
-  SmallVector<ValuePtr, 3> workItemID, workGroupSize;
+  SmallVector<Value, 3> workItemID, workGroupSize;
   packIdAndNumId(launchOp.getThreadIds(), launchOp.getBlockSize(),
                  numThreadDims, workItemID, workGroupSize);
   for (auto &loopOp : threadRootForOps) {
@@ -339,18 +337,17 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
 // Convert the computation rooted at the `rootForOp`, into a GPU kernel with the
 // given workgroup size and number of workgroups.
 template <typename OpTy>
-LogicalResult createLaunchFromOp(OpTy rootForOp,
-                                 ArrayRef<ValuePtr> numWorkGroups,
-                                 ArrayRef<ValuePtr> workGroupSizes) {
+LogicalResult createLaunchFromOp(OpTy rootForOp, ArrayRef<Value> numWorkGroups,
+                                 ArrayRef<Value> workGroupSizes) {
   OpBuilder builder(rootForOp.getOperation());
   if (numWorkGroups.size() > 3) {
     return rootForOp.emitError("invalid ")
           << numWorkGroups.size() << "-D workgroup specification";
   }
   auto loc = rootForOp.getLoc();
-  ValuePtr one = builder.create<ConstantOp>(
+  Value one = builder.create<ConstantOp>(
       loc, builder.getIntegerAttr(builder.getIndexType(), 1));
-  SmallVector<ValuePtr, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
+  SmallVector<Value, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
   for (auto numWorkGroup : enumerate(numWorkGroups)) {
     numWorkGroups3D[numWorkGroup.index()] = numWorkGroup.value();
   }
@@ -360,7 +357,7 @@ LogicalResult createLaunchFromOp(OpTy rootForOp,
 
   // Get the values used within the region of the rootForOp but defined above
   // it.
-  llvm::SetVector<ValuePtr> valuesToForwardSet;
+  llvm::SetVector<Value> valuesToForwardSet;
   getUsedValuesDefinedAbove(rootForOp.region(), rootForOp.region(),
                             valuesToForwardSet);
   // Also add the values used for the lb, ub, and step of the rootForOp.
@@ -380,8 +377,8 @@ LogicalResult createLaunchFromOp(OpTy rootForOp,
   // defined outside. They all are replaced with kernel arguments.
   for (const auto &pair :
        llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
-    ValuePtr from = std::get<0>(pair);
-    ValuePtr to = std::get<1>(pair);
+    Value from = std::get<0>(pair);
+    Value to = std::get<1>(pair);
     replaceAllUsesInRegionWith(from, to, launchOp.body());
   }
   return success();
@@ -401,23 +398,22 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
   OpBuilder builder(rootForOp.getOperation());
   // Prepare the grid and block sizes for the launch operation. If there is
   // no loop mapped to a specific dimension, use constant "1" as its size.
-  ValuePtr constOne =
-      (numBlockDims < 3 || numThreadDims < 3)
-          ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
-          : nullptr;
-  ValuePtr gridSizeX = dims[0];
-  ValuePtr gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
-  ValuePtr gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
-  ValuePtr blockSizeX = dims[numBlockDims];
-  ValuePtr blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
-  ValuePtr blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
+  Value constOne = (numBlockDims < 3 || numThreadDims < 3)
+                       ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
+                       : nullptr;
+  Value gridSizeX = dims[0];
+  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
+  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
+  Value blockSizeX = dims[numBlockDims];
+  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
+  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
 
   // Create a launch op and move the body region of the innermost loop to the
   // launch op. Pass the values defined outside the outermost loop and used
   // inside the innermost loop and loop lower bounds as kernel data arguments.
   // Still assuming perfect nesting so there are no values other than induction
   // variables that are defined in one loop and used in deeper loops.
-  llvm::SetVector<ValuePtr> valuesToForwardSet;
+  llvm::SetVector<Value> valuesToForwardSet;
   getUsedValuesDefinedAbove(innermostForOp.region(), rootForOp.region(),
                             valuesToForwardSet);
   auto valuesToForward = valuesToForwardSet.takeVector();
@@ -451,15 +447,15 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
                                       originallyForwardedValues);
   auto stepArgumentIt = std::next(lbArgumentIt, lbs.size());
   for (auto en : llvm::enumerate(ivs)) {
-    ValuePtr id =
+    Value id =
         en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
-    ValuePtr step = steps[en.index()];
+    Value step = steps[en.index()];
     if (!isConstantOne(step))
       id = builder.create<MulIOp>(rootForOp.getLoc(), step, id);
 
-    ValuePtr ivReplacement =
+    Value ivReplacement =
         builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
     en.value()->replaceAllUsesWith(ivReplacement);
     replaceAllUsesInRegionWith(steps[en.index()], *stepArgumentIt,
@@ -473,8 +469,8 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
   // trailing positions, make sure we don't touch those.
   for (const auto &pair :
        llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
-    ValuePtr from = std::get<0>(pair);
-    ValuePtr to = std::get<1>(pair);
+    Value from = std::get<0>(pair);
+    Value to = std::get<1>(pair);
     replaceAllUsesInRegionWith(from, to, launchOp.body());
   }
 
@@ -504,8 +500,8 @@ static LogicalResult convertLoopNestToGPULaunch(OpTy forOp,
 // nested. The workgroup size and num workgroups is provided as input
 template <typename OpTy>
 static LogicalResult convertLoopToGPULaunch(OpTy forOp,
-                                            ArrayRef<ValuePtr> numWorkGroups,
-                                            ArrayRef<ValuePtr> workGroupSize) {
+                                            ArrayRef<Value> numWorkGroups,
+                                            ArrayRef<Value> workGroupSize) {
   if (failed(checkLoopOpMappable(forOp, numWorkGroups.size(),
                                  workGroupSize.size()))) {
     return failure();
@@ -526,7 +522,7 @@ LogicalResult mlir::convertLoopNestToGPULaunch(ForOp forOp,
 }
 
 LogicalResult mlir::convertLoopToGPULaunch(loop::ForOp forOp,
-                                           ArrayRef<ValuePtr> numWorkGroups,
-                                           ArrayRef<ValuePtr> workGroupSizes) {
+                                           ArrayRef<Value> numWorkGroups,
+                                           ArrayRef<Value> workGroupSizes) {
   return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
 }
```
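For context, a hypothetical driver (not part of this commit; the grid and block sizes are made up for illustration) showing how the updated `mlir::convertLoopToGPULaunch` entry point now takes its workgroup configuration as value-typed `Value`s:

```cpp
#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/Ops.h"

// Hypothetical: map `forOp` onto an 8x8 grid of 64x1 workgroups.
static mlir::LogicalResult launch2D(mlir::loop::ForOp forOp,
                                    mlir::OpBuilder &b) {
  auto loc = forOp.getLoc();
  // Values are created and handed over by value, no ValuePtr involved.
  mlir::Value numWorkGroups[] = {b.create<mlir::ConstantIndexOp>(loc, 8),
                                 b.create<mlir::ConstantIndexOp>(loc, 8)};
  mlir::Value workGroupSizes[] = {b.create<mlir::ConstantIndexOp>(loc, 64),
                                  b.create<mlir::ConstantIndexOp>(loc, 1)};
  return mlir::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
}
```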