diff options
| author | Stephan Herhut <herhut@google.com> | 2019-09-04 06:15:40 -0700 |
|---|---|---|
| committer | A. Unique TensorFlower <gardener@tensorflow.org> | 2019-09-04 06:16:07 -0700 |
| commit | dfd06af562e93e767000e96c436846caea847c38 (patch) | |
| tree | ab4cf40cb2440e87fff5447663718a7fcf43741b /mlir/lib/Dialect/GPU/Transforms | |
| parent | 2f13df13b0bb481702fc83eb50c273deadb55f20 (diff) | |
| download | bcm5719-llvm-dfd06af562e93e767000e96c436846caea847c38.tar.gz bcm5719-llvm-dfd06af562e93e767000e96c436846caea847c38.zip | |
Make GPU kernel outlining inline constants.
It is generally beneficial to pass fewer arguments to a kernel, so cloning constants
into the kernel is worthwhile.
PiperOrigin-RevId: 267139084
Diffstat (limited to 'mlir/lib/Dialect/GPU/Transforms')
| -rw-r--r-- | mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp | 42 |
1 files changed, 40 insertions, 2 deletions
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index dae8ae8ec55..f464b091fc8 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -56,6 +56,42 @@ static void injectGpuIndexOperations(Location loc, FuncOp kernelFunc) {
   }
 }
 
+// Move all constant arguments of the given kernel function into the function,
+// thereby reducing the number of kernel arguments.
+static gpu::LaunchFuncOp inlineConstants(FuncOp kernelFunc,
+                                         gpu::LaunchFuncOp launch) {
+  OpBuilder kernelBuilder(kernelFunc.getBody());
+  auto &firstBlock = kernelFunc.getBody().front();
+  llvm::SmallVector<Value *, 8> newLaunchArgs;
+  for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) {
+    auto operandOp = launch.getKernelOperand(i)->getDefiningOp();
+    auto constant = dyn_cast_or_null<ConstantOp>(operandOp);
+    if (!constant) {
+      newLaunchArgs.push_back(launch.getKernelOperand(i));
+      continue;
+    }
+    auto newConstant = kernelBuilder.clone(*operandOp);
+    firstBlock.getArgument(i)->replaceAllUsesWith(newConstant->getResult(0));
+    firstBlock.eraseArgument(i);
+  }
+  if (newLaunchArgs.size() != launch.getNumKernelOperands()) {
+    std::reverse(newLaunchArgs.begin(), newLaunchArgs.end());
+    OpBuilder LaunchBuilder(launch);
+    SmallVector<Type, 8> newArgumentTypes;
+    newArgumentTypes.reserve(firstBlock.getNumArguments());
+    for (auto value : firstBlock.getArguments()) {
+      newArgumentTypes.push_back(value->getType());
+    }
+    kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {}));
+    auto newLaunch = LaunchBuilder.create<gpu::LaunchFuncOp>(
+        launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(),
+        launch.getBlockSizeOperandValues(), newLaunchArgs);
+    launch.erase();
+    return newLaunch;
+  }
+  return launch;
+}
+
 // Outline the `gpu.launch` operation body into a kernel function. Replace
 // `gpu.return` operations by `std.return` in the generated functions.
 static FuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
@@ -80,14 +116,16 @@ static FuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
 }
 
 // Replace `gpu.launch` operations with an `gpu.launch_func` operation launching
-// `kernelFunc`.
+// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
+// constant region arguments inlined.
 static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp, FuncOp kernelFunc) {
   OpBuilder builder(launchOp);
   SmallVector<Value *, 4> kernelOperandValues(
       launchOp.getKernelOperandValues());
-  builder.create<gpu::LaunchFuncOp>(
+  auto launchFuncOp = builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(), kernelOperandValues);
+  inlineConstants(kernelFunc, launchFuncOp);
   launchOp.erase();
 }
 

