diff options
author | Tres Popp <tpopp@google.com> | 2020-01-14 11:09:59 +0100 |
---|---|---|
committer | Stephan Herhut <herhut@google.com> | 2020-01-14 12:05:47 +0100 |
commit | 4624a1e8ac8a3f69cc887403b976f538f587744a (patch) | |
tree | 13cb3b1371abedefbdbd7e09933633acc4aca44c | |
parent | 9492e9d8cfd356109276da5aa926b297db0e16db (diff) | |
download | bcm5719-llvm-4624a1e8ac8a3f69cc887403b976f538f587744a.tar.gz bcm5719-llvm-4624a1e8ac8a3f69cc887403b976f538f587744a.zip |
[mlir] Create a gpu.module operation for the GPU Dialect.
Summary:
This is based on the use of code constantly checking for an attribute on
a model and instead represents the distinct operaion with a different
op. Instead, this op can be used to provide better filtering.
Reviewers: herhut, mravishankar, antiagainst, rriddle
Reviewed By: herhut, antiagainst, rriddle
Subscribers: liufengdb, aartbik, jholewinski, mgorny, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, nicolasvasilache, csigg, arpith-jacob, mgester, lucyrfox, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D72336
24 files changed, 235 insertions, 140 deletions
diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h index 4eb6379adf6..f61e40ef5f9 100644 --- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h +++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h @@ -19,12 +19,17 @@ namespace mlir { class Location; class ModuleOp; +template <typename T> +class OpPassBase; + +namespace gpu { +class GPUModuleOp; +} // namespace gpu + namespace LLVM { class LLVMDialect; } // namespace LLVM -template <typename T> class OpPassBase; - using OwnedCubin = std::unique_ptr<std::vector<char>>; using CubinGenerator = std::function<OwnedCubin(const std::string &, Location, StringRef)>; @@ -38,7 +43,7 @@ using CubinGenerator = /// attached as a string attribute named 'nvvm.cubin' to the kernel function. /// After the transformation, the body of the kernel function is removed (i.e., /// it is turned into a declaration). -std::unique_ptr<OpPassBase<ModuleOp>> +std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator); /// Creates a pass to convert a gpu.launch_func operation into a sequence of diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h index 75e4f7e374c..b3212279fab 100644 --- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h +++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h @@ -14,15 +14,19 @@ namespace mlir { class LLVMTypeConverter; class OwningRewritePatternList; -class ModuleOp; -template <typename OpT> class OpPassBase; +template <typename OpT> +class OpPassBase; + +namespace gpu { +class GPUModuleOp; +} /// Collect a set of patterns to convert from the GPU dialect to NVVM. void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter, OwningRewritePatternList &patterns); /// Creates a pass that lowers GPU dialect operations to NVVM counterparts. -std::unique_ptr<OpPassBase<ModuleOp>> createLowerGpuOpsToNVVMOpsPass(); +std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(); } // namespace mlir diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index 766ddbf202c..3df6ff4be0c 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -588,4 +588,56 @@ def GPU_BarrierOp : GPU_Op<"barrier"> { let printer = [{ p << getOperationName(); }]; } +def GPU_GPUModuleOp : GPU_Op<"module", [ + IsolatedFromAbove, SymbolTable, Symbol, + SingleBlockImplicitTerminator<"ModuleEndOp"> +]> { + let summary = "A top level compilation unit containing code to be run on a GPU."; + let description = [{ + GPU module contains code that is intended to be run on a GPU. A host device + can launch this code through a gpu.launc_func that creates a fully + qualified symbol through the gpu.module's symbol and a gpu.func symbol + contained in the gpu.module. + + The module's top-level scope is modeled by a single region with a single + block. GPU modules are required to have a name that is used for symbol + resolution by the gpu.launch_func operation. + + Using an op with a region to define a GPU module enables "embedding" GPU + modules with SIMT execution models in other dialects in a clean manner and + allows filtering of code regions to execute passes on only code intended to + or not intended to be run on the separate device. + + ``` + gpu.module @symbol_name { + gpu.func {} + ... + gpu.module_end + } + + ``` + }]; + let builders = [OpBuilder<"Builder *builder, OperationState &result, " + "StringRef name">]; + let parser = [{ return ::parseGPUModuleOp(parser, result); }]; + let printer = [{ return ::print(p, *this); }]; + let regions = (region SizedRegion<1>:$body); + + // We need to ensure the block inside the region is properly terminated; + // the auto-generated builders do not guarantee that. + let skipDefaultBuilders = 1; +} + +def GPU_ModuleEndOp : GPU_Op<"module_end", [ + Terminator, HasParent<"GPUModuleOp"> +]> { + let summary = "A pseudo op that marks the end of a gpu.module."; + let description = [{ + This op terminates the only block inside the only region of a `gpu.module`. + }]; + + let parser = [{ return success(); }]; + let printer = [{ p << getOperationName(); }]; +} + #endif // GPU_OPS diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp index 66a2e66f99a..b111c96313c 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp @@ -46,18 +46,15 @@ static constexpr const char *kCubinAnnotation = "nvvm.cubin"; /// IR and further to PTX. A user provided CubinGenerator compiles the PTX to /// GPU binary code, which is then attached as an attribute to the function. The /// function body is erased. -class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> { +class GpuKernelToCubinPass + : public OperationPass<GpuKernelToCubinPass, gpu::GPUModuleOp> { public: GpuKernelToCubinPass( CubinGenerator cubinGenerator = compilePtxToCubinForTesting) : cubinGenerator(cubinGenerator) {} - void runOnModule() override { - ModuleOp module = getModule(); - if (!module.getAttrOfType<UnitAttr>( - gpu::GPUDialect::getKernelModuleAttrName()) || - !module.getName()) - return; + void runOnOperation() override { + gpu::GPUModuleOp module = getOperation(); // Make sure the NVPTX target is initialized. LLVMInitializeNVPTXTarget(); @@ -71,8 +68,8 @@ public: // Translate the module to CUBIN and attach the result as attribute to the // module. - if (auto cubinAttr = translateGpuModuleToCubinAnnotation( - *llvmModule, module.getLoc(), *module.getName())) + if (auto cubinAttr = translateGPUModuleToCubinAnnotation( + *llvmModule, module.getLoc(), module.getName())) module.setAttr(kCubinAnnotation, cubinAttr); else signalPassFailure(); @@ -92,7 +89,7 @@ private: StringRef name); /// Translates llvmModule to cubin and returns the result as attribute. - StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule, + StringAttr translateGPUModuleToCubinAnnotation(llvm::Module &llvmModule, Location loc, StringRef name); CubinGenerator cubinGenerator; @@ -149,7 +146,7 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule, return cubinGenerator(ptx, loc, name); } -StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation( +StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation( llvm::Module &llvmModule, Location loc, StringRef name) { auto cubin = convertModuleToCubin(llvmModule, loc, name); if (!cubin) @@ -157,7 +154,7 @@ StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation( return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext()); } -std::unique_ptr<OpPassBase<ModuleOp>> +std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) { return std::make_unique<GpuKernelToCubinPass>(cubinGenerator); } diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp index 41f69d6e21d..31024d2881b 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp @@ -132,9 +132,9 @@ public: // GPU kernel modules are no longer necessary since we have a global // constant with the CUBIN data. - for (auto m : llvm::make_early_inc_range(getModule().getOps<ModuleOp>())) - if (m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName())) - m.erase(); + for (auto m : + llvm::make_early_inc_range(getModule().getOps<gpu::GPUModuleOp>())) + m.erase(); } private: @@ -343,8 +343,8 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls( builder.getI32IntegerAttr(0)); // Create an LLVM global with CUBIN extracted from the kernel annotation and // obtain a pointer to the first byte in it. - auto kernelModule = - getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName()); + auto kernelModule = getModule().lookupSymbol<gpu::GPUModuleOp>( + launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation); @@ -354,8 +354,7 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls( return signalPassFailure(); } - assert(kernelModule.getName() && "expected a named module"); - SmallString<128> nameBuffer(*kernelModule.getName()); + SmallString<128> nameBuffer(kernelModule.getName()); nameBuffer.append(kCubinStorageSuffix); Value data = LLVM::createGlobalString( loc, builder, nameBuffer.str(), cubinAttr.getValue(), diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index e2b1e0e533c..84bc7ff1d5f 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -200,7 +200,7 @@ private: auto type = operand.getType().cast<LLVM::LLVMType>(); // Create shared memory array to store the warp reduction. - auto module = operand.getDefiningOp()->getParentOfType<ModuleOp>(); + auto module = operand.getDefiningOp()->getParentOfType<gpu::GPUModuleOp>(); assert(module && "op must belong to a module"); Value sharedMemPtr = createSharedMemoryArray(loc, module, type, kWarpSize, rewriter); @@ -391,10 +391,10 @@ private: } /// Creates a global array stored in shared memory. - Value createSharedMemoryArray(Location loc, ModuleOp module, + Value createSharedMemoryArray(Location loc, gpu::GPUModuleOp module, LLVM::LLVMType elementType, int numElements, ConversionPatternRewriter &rewriter) const { - OpBuilder builder(module.getBodyRegion()); + OpBuilder builder(module.body()); auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); StringRef name = "reduce_buffer"; @@ -699,13 +699,11 @@ struct GPUReturnOpLowering : public LLVMOpLowering { /// /// This pass only handles device code and is not meant to be run on GPU host /// code. -class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> { +class LowerGpuOpsToNVVMOpsPass + : public OperationPass<LowerGpuOpsToNVVMOpsPass, gpu::GPUModuleOp> { public: - void runOnModule() override { - ModuleOp m = getModule(); - if (!m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName())) - return; - + void runOnOperation() override { + gpu::GPUModuleOp m = getOperation(); OwningRewritePatternList patterns; NVVMTypeConverter converter(m.getContext()); populateStdToLLVMConversionPatterns(converter, patterns); @@ -718,7 +716,7 @@ public: target.addLegalDialect<LLVM::LLVMDialect>(); target.addLegalDialect<NVVM::NVVMDialect>(); // TODO(csigg): Remove once we support replacing non-root ops. - target.addLegalOp<gpu::YieldOp>(); + target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>(); if (failed(applyPartialConversion(m, target, patterns, &converter))) signalPassFailure(); } @@ -750,7 +748,8 @@ void mlir::populateGpuToNVVMConversionPatterns( "__nv_exp"); } -std::unique_ptr<OpPassBase<ModuleOp>> mlir::createLowerGpuOpsToNVVMOpsPass() { +std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> +mlir::createLowerGpuOpsToNVVMOpsPass() { return std::make_unique<LowerGpuOpsToNVVMOpsPass>(); } diff --git a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt index be82894461d..adeb4e099ab 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt @@ -1,8 +1,15 @@ +set(LLVM_TARGET_DEFINITIONS GPUToSPIRV.td) +mlir_tablegen(GPUToSPIRV.cpp.inc -gen-rewriters) +add_public_tablegen_target(MLIRGPUToSPIRVIncGen) + add_llvm_library(MLIRGPUtoSPIRVTransforms ConvertGPUToSPIRV.cpp ConvertGPUToSPIRVPass.cpp ) +add_dependencies(MLIRGPUtoSPIRVTransforms + MLIRGPUToSPIRVIncGen) + target_link_libraries(MLIRGPUtoSPIRVTransforms MLIRGPU MLIRIR diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp index 2fd8cedfd63..a90cea99be4 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp @@ -63,27 +63,13 @@ private: SmallVector<int32_t, 3> workGroupSizeAsInt32; }; -/// Pattern to convert a module with gpu.kernel_module attribute to a -/// spv.module. -class KernelModuleConversion final : public SPIRVOpLowering<ModuleOp> { +/// Pattern to convert a gpu.module to a spv.module. +class GPUModuleConversion final : public SPIRVOpLowering<gpu::GPUModuleOp> { public: - using SPIRVOpLowering<ModuleOp>::SPIRVOpLowering; + using SPIRVOpLowering<gpu::GPUModuleOp>::SPIRVOpLowering; PatternMatchResult - matchAndRewrite(ModuleOp moduleOp, ArrayRef<Value> operands, - ConversionPatternRewriter &rewriter) const override; -}; - -/// Pattern to convert a module terminator op to a terminator of spv.module op. -// TODO: Move this into DRR, but that requires ModuleTerminatorOp to be defined -// in ODS. -class KernelModuleTerminatorConversion final - : public SPIRVOpLowering<ModuleTerminatorOp> { -public: - using SPIRVOpLowering<ModuleTerminatorOp>::SPIRVOpLowering; - - PatternMatchResult - matchAndRewrite(ModuleTerminatorOp terminatorOp, ArrayRef<Value> operands, + matchAndRewrite(gpu::GPUModuleOp moduleOp, ArrayRef<Value> operands, ConversionPatternRewriter &rewriter) const override; }; @@ -284,16 +270,12 @@ KernelFnConversion::matchAndRewrite(gpu::GPUFuncOp funcOp, } //===----------------------------------------------------------------------===// -// ModuleOp with gpu.kernel_module. +// ModuleOp with gpu.module. //===----------------------------------------------------------------------===// -PatternMatchResult KernelModuleConversion::matchAndRewrite( - ModuleOp moduleOp, ArrayRef<Value> operands, +PatternMatchResult GPUModuleConversion::matchAndRewrite( + gpu::GPUModuleOp moduleOp, ArrayRef<Value> operands, ConversionPatternRewriter &rewriter) const { - if (!moduleOp.getAttrOfType<UnitAttr>( - gpu::GPUDialect::getKernelModuleAttrName())) { - return matchFailure(); - } // TODO : Generalize this to account for different extensions, // capabilities, extended_instruction_sets, other addressing models // and memory models. @@ -302,8 +284,8 @@ PatternMatchResult KernelModuleConversion::matchAndRewrite( spirv::MemoryModel::GLSL450, spirv::Capability::Shader, spirv::Extension::SPV_KHR_storage_buffer_storage_class); // Move the region from the module op into the SPIR-V module. - Region &spvModuleRegion = spvModule.getOperation()->getRegion(0); - rewriter.inlineRegionBefore(moduleOp.getBodyRegion(), spvModuleRegion, + Region &spvModuleRegion = spvModule.body(); + rewriter.inlineRegionBefore(moduleOp.body(), spvModuleRegion, spvModuleRegion.begin()); // The spv.module build method adds a block with a terminator. Remove that // block. The terminator of the module op in the remaining block will be @@ -314,17 +296,6 @@ PatternMatchResult KernelModuleConversion::matchAndRewrite( } //===----------------------------------------------------------------------===// -// ModuleTerminatorOp for gpu.kernel_module. -//===----------------------------------------------------------------------===// - -PatternMatchResult KernelModuleTerminatorConversion::matchAndRewrite( - ModuleTerminatorOp terminatorOp, ArrayRef<Value> operands, - ConversionPatternRewriter &rewriter) const { - rewriter.replaceOpWithNewOp<spirv::ModuleEndOp>(terminatorOp); - return matchSuccess(); -} - -//===----------------------------------------------------------------------===// // GPU return inside kernel functions to SPIR-V return. //===----------------------------------------------------------------------===// @@ -342,14 +313,18 @@ PatternMatchResult GPUReturnOpConversion::matchAndRewrite( // GPU To SPIRV Patterns. //===----------------------------------------------------------------------===// +namespace { +#include "GPUToSPIRV.cpp.inc" +} + void mlir::populateGPUToSPIRVPatterns(MLIRContext *context, SPIRVTypeConverter &typeConverter, OwningRewritePatternList &patterns, ArrayRef<int64_t> workGroupSize) { + populateWithGenerated(context, &patterns); patterns.insert<KernelFnConversion>(context, typeConverter, workGroupSize); patterns.insert< - GPUReturnOpConversion, ForOpConversion, KernelModuleConversion, - KernelModuleTerminatorConversion, + GPUReturnOpConversion, ForOpConversion, GPUModuleConversion, LaunchConfigConversion<gpu::BlockDimOp, spirv::BuiltIn::WorkgroupSize>, LaunchConfigConversion<gpu::BlockIdOp, spirv::BuiltIn::WorkgroupId>, LaunchConfigConversion<gpu::GridDimOp, spirv::BuiltIn::NumWorkgroups>, diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp index 68392c36765..bc8273ec2a9 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp @@ -60,15 +60,12 @@ void GPUToSPIRVPass::runOnModule() { SmallVector<Operation *, 1> kernelModules; OpBuilder builder(context); - module.walk([&builder, &kernelModules](ModuleOp moduleOp) { - if (moduleOp.getAttrOfType<UnitAttr>( - gpu::GPUDialect::getKernelModuleAttrName())) { - // For each kernel module (should be only 1 for now, but that is not a - // requirement here), clone the module for conversion because the - // gpu.launch function still needs the kernel module. - builder.setInsertionPoint(moduleOp.getOperation()); - kernelModules.push_back(builder.clone(*moduleOp.getOperation())); - } + module.walk([&builder, &kernelModules](gpu::GPUModuleOp moduleOp) { + // For each kernel module (should be only 1 for now, but that is not a + // requirement here), clone the module for conversion because the + // gpu.launch function still needs the kernel module. + builder.setInsertionPoint(moduleOp.getOperation()); + kernelModules.push_back(builder.clone(*moduleOp.getOperation())); }); SPIRVTypeConverter typeConverter; diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td new file mode 100644 index 00000000000..cfe9d26273c --- /dev/null +++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td @@ -0,0 +1,22 @@ +//===-- GPUToSPIRV.td - GPU to SPIR-V Dialect Lowerings ----*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains patterns to lower GPU dialect ops to to SPIR-V ops. +// +//===----------------------------------------------------------------------===// + + +#ifndef CONVERT_GPU_TO_SPIRV +#define CONVERT_GPU_TO_SPIRV + +include "mlir/Dialect/GPU/GPUOps.td" +include "mlir/Dialect/SPIRV/SPIRVStructureOps.td" + +def : Pat<(GPU_ModuleEndOp), (SPV_ModuleEndOp)>; + +#endif // CONVERT_GPU_TO_SPIRV diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index e750d0fefff..dbca1fb003a 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -72,15 +72,10 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op, // Check that `launch_func` refers to a well-formed GPU kernel module. StringRef kernelModuleName = launchOp.getKernelModuleName(); - auto kernelModule = module.lookupSymbol<ModuleOp>(kernelModuleName); + auto kernelModule = module.lookupSymbol<GPUModuleOp>(kernelModuleName); if (!kernelModule) return launchOp.emitOpError() << "kernel module '" << kernelModuleName << "' is undefined"; - if (!kernelModule.getAttrOfType<UnitAttr>( - GPUDialect::getKernelModuleAttrName())) - return launchOp.emitOpError("module '") - << kernelModuleName << "' is missing the '" - << GPUDialect::getKernelModuleAttrName() << "' attribute"; // Check that `launch_func` refers to a well-formed kernel function. StringRef kernelName = launchOp.kernel(); @@ -517,10 +512,9 @@ void LaunchFuncOp::build(Builder *builder, OperationState &result, result.addOperands(kernelOperands); result.addAttribute(getKernelAttrName(), builder->getStringAttr(kernelFunc.getName())); - auto kernelModule = kernelFunc.getParentOfType<ModuleOp>(); - if (Optional<StringRef> kernelModuleName = kernelModule.getName()) - result.addAttribute(getKernelModuleAttrName(), - builder->getSymbolRefAttr(*kernelModuleName)); + auto kernelModule = kernelFunc.getParentOfType<GPUModuleOp>(); + result.addAttribute(getKernelModuleAttrName(), + builder->getSymbolRefAttr(kernelModule.getName())); } void LaunchFuncOp::build(Builder *builder, OperationState &result, @@ -820,6 +814,47 @@ LogicalResult GPUFuncOp::verifyBody() { return success(); } +//===----------------------------------------------------------------------===// +// GPUModuleOp +//===----------------------------------------------------------------------===// + +void GPUModuleOp::build(Builder *builder, OperationState &result, + StringRef name) { + ensureTerminator(*result.addRegion(), *builder, result.location); + result.attributes.push_back(builder->getNamedAttr( + ::mlir::SymbolTable::getSymbolAttrName(), builder->getStringAttr(name))); +} + +static ParseResult parseGPUModuleOp(OpAsmParser &parser, + OperationState &result) { + StringAttr nameAttr; + if (parser.parseSymbolName(nameAttr, SymbolTable::getSymbolAttrName(), + result.attributes)) + return failure(); + + // If module attributes are present, parse them. + if (parser.parseOptionalAttrDictWithKeyword(result.attributes)) + return failure(); + + // Parse the module body. + auto *body = result.addRegion(); + if (parser.parseRegion(*body, None, None)) + return failure(); + + // Ensure that this module has a valid terminator. + GPUModuleOp::ensureTerminator(*body, parser.getBuilder(), result.location); + return success(); +} + +static void print(OpAsmPrinter &p, GPUModuleOp op) { + p << op.getOperationName() << ' '; + p.printSymbolName(op.getName()); + p.printOptionalAttrDictWithKeyword(op.getAttrs(), + {SymbolTable::getSymbolAttrName()}); + p.printRegion(op.getOperation()->getRegion(0), /*printEntryBlockArgs=*/false, + /*printBlockTerminators=*/false); +} + // Namespace avoids ambiguous ReturnOpOperandAdaptor. namespace mlir { namespace gpu { diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp index 37f9c2e7b84..0f8e2253980 100644 --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -140,8 +140,8 @@ namespace { /// inside a nested module. It also creates an external function of the same /// name in the parent module. /// -/// The kernel modules are intended to be compiled to a cubin blob independently -/// in a separate pass. The external functions can then be annotated with the +/// The gpu.modules are intended to be compiled to a cubin blob independently in +/// a separate pass. The external functions can then be annotated with the /// symbol of the cubin accessor function. class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> { public: @@ -174,15 +174,19 @@ public: } private: - // Returns a module containing kernelFunc and all callees (recursive). - ModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, - const SymbolTable &parentSymbolTable) { + // Returns a gpu.module containing kernelFunc and all callees (recursive). + gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, + const SymbolTable &parentSymbolTable) { + // TODO: This code cannot use an OpBuilder because it must be inserted into + // a SymbolTable by the caller. SymbolTable needs to be refactored to + // prevent manual building of Ops with symbols in code using SymbolTables + // and then this needs to use the OpBuilder. auto context = getModule().getContext(); Builder builder(context); - auto kernelModule = - ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName()); - kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(), - builder.getUnitAttr()); + OperationState state(kernelFunc.getLoc(), + gpu::GPUModuleOp::getOperationName()); + gpu::GPUModuleOp::build(&builder, state, kernelFunc.getName()); + auto kernelModule = cast<gpu::GPUModuleOp>(Operation::create(state)); SymbolTable symbolTable(kernelModule); symbolTable.insert(kernelFunc); diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir index 6865462595f..707f4a06395 100644 --- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir +++ b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir @@ -5,7 +5,7 @@ module attributes {gpu.container_module} { // CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00") // CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN") - module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} { + gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} { gpu.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} { gpu.return } diff --git a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir index 62fe2b99338..78b9f56b620 100644 --- a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir +++ b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s --test-kernel-to-cubin -split-input-file | FileCheck %s -// CHECK: attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} -module @foo attributes {gpu.kernel_module} { +// CHECK: attributes {nvvm.cubin = "CUBIN"} +gpu.module @foo { llvm.func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">) // CHECK: attributes {gpu.kernel} attributes { gpu.kernel } { @@ -11,7 +11,7 @@ module @foo attributes {gpu.kernel_module} { // ----- -module @bar attributes {gpu.kernel_module} { +gpu.module @bar { // CHECK: func @kernel_a llvm.func @kernel_a() attributes { gpu.kernel } { diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index 24bf56557c3..7f69cb7482c 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_index_ops() func @gpu_index_ops() attributes { gpu.kernel } { @@ -38,7 +38,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_all_reduce_op() func @gpu_all_reduce_op() attributes { gpu.kernel } { @@ -55,7 +55,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_all_reduce_region() func @gpu_all_reduce_region() attributes { gpu.kernel } { @@ -74,7 +74,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_shuffle() func @gpu_shuffle() attributes { gpu.kernel } { @@ -99,7 +99,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_sync() func @gpu_sync() attributes { gpu.kernel } { @@ -111,7 +111,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK: llvm.func @__nv_fabsf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_fabs(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_fabs @@ -126,7 +126,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK: llvm.func @__nv_ceilf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_ceil(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_ceil @@ -141,7 +141,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK: llvm.func @__nv_cosf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_cos(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_cos @@ -156,7 +156,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_exp(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_exp @@ -174,7 +174,7 @@ module attributes {gpu.kernel_module} { // ----- // Test that we handled properly operation with SymbolTable other than module op -module attributes {gpu.kernel_module} { +gpu.module @test_module { "test.symbol_scope"() ({ // CHECK: test.symbol_scope // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float diff --git a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir index 69a16b25139..115c71d1280 100644 --- a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir +++ b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s -module attributes {gpu.kernel_module} { +gpu.module @kernel { // CHECK-LABEL: llvm.func @private gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) { // Allocate private memory inside the function. @@ -32,7 +32,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel { // Workgroup buffers are allocated as globals. // CHECK: llvm.mlir.global internal @[[buffer:.*]]() // CHECK-SAME: addr_space = 3 @@ -72,7 +72,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel { // Check that the total size was computed correctly. // CHECK: llvm.mlir.global internal @[[buffer:.*]]() // CHECK-SAME: addr_space = 3 @@ -113,7 +113,7 @@ module attributes {gpu.kernel_module} { // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel { // Check that several buffers are defined. // CHECK: llvm.mlir.global internal @[[buffer1:.*]]() // CHECK-SAME: !llvm<"[1 x float]"> diff --git a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir index c0a68a9db2a..7f4081e4eda 100644 --- a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir @@ -9,7 +9,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_workgroup_id_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]] @@ -32,7 +32,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_workgroup_id_y() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]] @@ -55,7 +55,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_workgroup_id_z() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]] @@ -78,7 +78,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_workgroup_size_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPSIZE]] @@ -101,7 +101,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[LOCALINVOCATIONID:@.*]] built_in("LocalInvocationId") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_local_id_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[LOCALINVOCATIONID]] @@ -124,7 +124,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[NUMWORKGROUPS:@.*]] built_in("NumWorkgroups") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_num_workgroups_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[NUMWORKGROUPS]] diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir index d104c96cfa9..446c0d602ed 100644 --- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir @@ -16,7 +16,7 @@ module attributes {gpu.container_module} { } // CHECK-LABEL: spv.module "Logical" "GLSL450" - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { // CHECK-DAG: spv.globalVariable [[WORKGROUPSIZEVAR:@.*]] built_in("WorkgroupSize") : !spv.ptr<vector<3xi32>, Input> // CHECK-DAG: spv.globalVariable [[NUMWORKGROUPSVAR:@.*]] built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input> // CHECK-DAG: spv.globalVariable [[LOCALINVOCATIONIDVAR:@.*]] built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input> diff --git a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir index 6d38360b7e8..bd97315a2ea 100644 --- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir @@ -7,7 +7,7 @@ module attributes {gpu.container_module} { return } - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @loop_kernel(%arg2 : memref<10xf32>, %arg3 : memref<10xf32>) attributes {gpu.kernel} { // CHECK: [[LB:%.*]] = spv.constant 4 : i32 diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir index e1b687c1a0b..cca5eb9d0b4 100644 --- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir @@ -2,7 +2,7 @@ module attributes {gpu.container_module} { - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { // CHECK: spv.module "Logical" "GLSL450" { // CHECK-LABEL: func @kernel_1 // CHECK-SAME: {{%.*}}: f32 {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir index 8323fdf8709..8f900bf6b5c 100644 --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -167,7 +167,7 @@ module attributes {gpu.container_module} { } func @launch_func_missing_module_attribute(%sz : index) { - // expected-error@+1 {{module 'kernels' is missing the 'gpu.kernel_module' attribute}} + // expected-error@+1 {{kernel module 'kernels' is undefined}} "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) { kernel = "kernel_1", kernel_module = @kernels } : (index, index, index, index, index, index) -> () @@ -178,8 +178,7 @@ module attributes {gpu.container_module} { // ----- module attributes {gpu.container_module} { - module @kernels attributes {gpu.kernel_module} { - } + gpu.module @kernels { } func @launch_func_undefined_function(%sz : index) { // expected-error@+1 {{kernel function 'kernel_1' is undefined}} @@ -193,7 +192,7 @@ module attributes {gpu.container_module} { // ----- module attributes {gpu.container_module} { - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @kernel_1(%arg1 : !llvm<"float*">) kernel { gpu.return } @@ -211,7 +210,7 @@ module attributes {gpu.container_module} { // ----- module attributes {gpu.container_module} { - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } { gpu.return } @@ -229,7 +228,7 @@ module attributes {gpu.container_module} { // ----- -module @kernels attributes {gpu.kernel_module} { +gpu.module @kernels { gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } { gpu.return } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index 1dd08cea492..033e7cbcb7e 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -60,7 +60,7 @@ module attributes {gpu.container_module} { return } - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) attributes {gpu.kernel} { %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir index 5adb881a1dc..425b4b3090c 100644 --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -136,7 +136,7 @@ func @recursive_device_function() { gpu.return } -// CHECK: module @function_call_kernel attributes {gpu.kernel_module} { +// CHECK: gpu.module @function_call_kernel { // CHECK: gpu.func @function_call_kernel() // CHECK: call @device_function() : () -> () // CHECK: call @device_function() : () -> () diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp index d6160d6d6e0..a05016f48e8 100644 --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -105,7 +105,7 @@ static LogicalResult runMLIRPasses(ModuleOp m) { applyPassManagerCLOptions(pm); pm.addPass(createGpuKernelOutliningPass()); - auto &kernelPm = pm.nest<ModuleOp>(); + auto &kernelPm = pm.nest<gpu::GPUModuleOp>(); kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass()); kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin)); pm.addPass(createLowerToLLVMPass()); |