diff options
| -rw-r--r-- | clang/lib/CodeGen/CGCUDANV.cpp | 66 | ||||
| -rw-r--r-- | clang/lib/CodeGen/CGCUDARuntime.h | 8 | ||||
| -rw-r--r-- | clang/lib/CodeGen/CodeGenModule.cpp | 64 | ||||
| -rw-r--r-- | clang/test/CodeGenCUDA/device-stub.cu | 47 | ||||
| -rw-r--r-- | clang/test/CodeGenCUDA/filter-decl.cu | 6 | 
5 files changed, 157 insertions, 34 deletions
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 9dd7928bcf9..f0ecb57c714 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -38,6 +38,7 @@ private:    llvm::Module &TheModule;    /// Keeps track of kernel launch stubs emitted in this module    llvm::SmallVector<llvm::Function *, 16> EmittedKernels; +  llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;    /// Keeps track of variables containing handles of GPU binaries. Populated by    /// ModuleCtorFunction() and used to create corresponding cleanup calls in    /// ModuleDtorFunction() @@ -47,7 +48,7 @@ private:    llvm::Constant *getLaunchFn() const;    /// Creates a function to register all kernel stubs generated in this module. -  llvm::Function *makeRegisterKernelsFn(); +  llvm::Function *makeRegisterGlobalsFn();    /// Helper function that generates a constant string and returns a pointer to    /// the start of the string.  The result of this function can be used anywhere @@ -68,6 +69,10 @@ public:    CGNVCUDARuntime(CodeGenModule &CGM);    void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; +  void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { +    DeviceVars.push_back(std::make_pair(&Var, Flags)); +  } +    /// Creates module constructor function    llvm::Function *makeModuleCtorFunction() override;    /// Creates module destructor function @@ -158,19 +163,24 @@ void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,    CGF.EmitBlock(EndBlock);  } -/// Creates internal function to register all kernel stubs generated in this -/// module with the CUDA runtime. +/// Creates a function that sets up state on the host side for CUDA objects that +/// have a presence on both the host and device sides. Specifically, registers +/// the host side of kernel functions and device global variables with the CUDA +/// runtime.  /// \code -/// void __cuda_register_kernels(void** GpuBinaryHandle) { +/// void __cuda_register_globals(void** GpuBinaryHandle) {  ///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);  ///    ...  ///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); +///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); +///    ... +///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);  /// }  /// \endcode -llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() { +llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {    llvm::Function *RegisterKernelsFunc = llvm::Function::Create(        llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), -      llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule); +      llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);    llvm::BasicBlock *EntryBB =        llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);    CGBuilderTy Builder(CGM, Context); @@ -186,18 +196,44 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {        "__cudaRegisterFunction");    // Extract GpuBinaryHandle passed as the first argument passed to -  // __cuda_register_kernels() and generate __cudaRegisterFunction() call for +  // __cuda_register_globals() and generate __cudaRegisterFunction() call for    // each emitted kernel.    llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();    for (llvm::Function *Kernel : EmittedKernels) {      llvm::Constant *KernelName = makeConstantString(Kernel->getName());      llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); -    llvm::Value *args[] = { +    llvm::Value *Args[] = {          &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),          KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,          NullPtr, NullPtr, NullPtr,          llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; -    Builder.CreateCall(RegisterFunc, args); +    Builder.CreateCall(RegisterFunc, Args); +  } + +  // void __cudaRegisterVar(void **, char *, char *, const char *, +  //                        int, int, int, int) +  std::vector<llvm::Type *> RegisterVarParams = { +      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, +      IntTy,        IntTy,     IntTy,     IntTy}; +  llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( +      llvm::FunctionType::get(IntTy, RegisterVarParams, false), +      "__cudaRegisterVar"); +  for (auto &Pair : DeviceVars) { +    llvm::GlobalVariable *Var = Pair.first; +    unsigned Flags = Pair.second; +    llvm::Constant *VarName = makeConstantString(Var->getName()); +    uint64_t VarSize = +        CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); +    llvm::Value *Args[] = { +        &GpuBinaryHandlePtr, +        Builder.CreateBitCast(Var, VoidPtrTy), +        VarName, +        VarName, +        llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), +        llvm::ConstantInt::get(IntTy, VarSize), +        llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), +        llvm::ConstantInt::get(IntTy, 0)}; +    Builder.CreateCall(RegisterVar, Args);    }    Builder.CreateRetVoid(); @@ -208,15 +244,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {  /// \code  /// void __cuda_module_ctor(void*) {  ///     Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); -///     __cuda_register_kernels(Handle0); +///     __cuda_register_globals(Handle0);  ///     ...  ///     HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); -///     __cuda_register_kernels(HandleN); +///     __cuda_register_globals(HandleN);  /// }  /// \endcode  llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { -  // void __cuda_register_kernels(void* handle); -  llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn(); +  // void __cuda_register_globals(void* handle); +  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();    // void ** __cudaRegisterFatBinary(void *);    llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(        llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), @@ -272,8 +308,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {      CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,                                     CGM.getPointerAlign()); -    // Call __cuda_register_kernels(GpuBinaryHandle); -    CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall); +    // Call __cuda_register_globals(GpuBinaryHandle); +    CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);      // Save GpuBinaryHandle so we can unregister it in destructor.      GpuBinaryHandles.push_back(GpuBinaryHandle); diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h index dcacf970327..0168f4f9e94 100644 --- a/clang/lib/CodeGen/CGCUDARuntime.h +++ b/clang/lib/CodeGen/CGCUDARuntime.h @@ -18,6 +18,7 @@  namespace llvm {  class Function; +class GlobalVariable;  }  namespace clang { @@ -37,6 +38,12 @@ protected:    CodeGenModule &CGM;  public: +  // Global variable properties that must be passed to CUDA runtime. +  enum DeviceVarFlags { +    ExternDeviceVar = 0x01,   // extern +    ConstantDeviceVar = 0x02, // __constant__ +  }; +    CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}    virtual ~CGCUDARuntime(); @@ -46,6 +53,7 @@ public:    /// Emits a kernel launch stub.    virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0; +  virtual void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) = 0;    /// Constructs and returns a module initialization function or nullptr if it's    /// not needed. Must be called after all kernels have been emitted. diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 887c53331f6..286ccb1595f 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1528,11 +1528,18 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {            !Global->hasAttr<CUDASharedAttr>())          return;      } else { -      if (!Global->hasAttr<CUDAHostAttr>() && ( -            Global->hasAttr<CUDADeviceAttr>() || -            Global->hasAttr<CUDAConstantAttr>() || -            Global->hasAttr<CUDASharedAttr>())) +      // We need to emit host-side 'shadows' for all global +      // device-side variables because the CUDA runtime needs their +      // size and host-side address in order to provide access to +      // their device-side incarnations. + +      // So device-only functions are the only things we skip. +      if (isa<FunctionDecl>(Global) && !Global->hasAttr<CUDAHostAttr>() && +          Global->hasAttr<CUDADeviceAttr>())          return; + +      assert((isa<FunctionDecl>(Global) || isa<VarDecl>(Global)) && +             "Expected Variable or Function");      }    } @@ -1561,8 +1568,15 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {    } else {      const auto *VD = cast<VarDecl>(Global);      assert(VD->isFileVarDecl() && "Cannot emit local var decl as global."); - -    if (VD->isThisDeclarationADefinition() != VarDecl::Definition && +    // We need to emit device-side global CUDA variables even if a +    // variable does not have a definition -- we still need to define +    // host-side shadow for it. +    bool MustEmitForCuda = LangOpts.CUDA && !LangOpts.CUDAIsDevice && +                           !VD->hasDefinition() && +                           (VD->hasAttr<CUDAConstantAttr>() || +                            VD->hasAttr<CUDADeviceAttr>()); +    if (!MustEmitForCuda && +        VD->isThisDeclarationADefinition() != VarDecl::Definition &&          !Context.isMSStaticDataMemberInlineDefinition(VD))        return;    } @@ -2444,6 +2458,10 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,    if (D->hasAttr<AnnotateAttr>())      AddGlobalAnnotations(D, GV); +  // Set the llvm linkage type as appropriate. +  llvm::GlobalValue::LinkageTypes Linkage = +      getLLVMLinkageVarDefinition(D, GV->isConstant()); +    // CUDA B.2.1 "The __device__ qualifier declares a variable that resides on    // the device. [...]"    // CUDA B.2.2 "The __constant__ qualifier, optionally used together with @@ -2451,9 +2469,34 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,    // Is accessible from all the threads within the grid and from the host    // through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()    // / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())." -  if (GV && LangOpts.CUDA && LangOpts.CUDAIsDevice && -      (D->hasAttr<CUDAConstantAttr>() || D->hasAttr<CUDADeviceAttr>())) { -    GV->setExternallyInitialized(true); +  if (GV && LangOpts.CUDA) { +    if (LangOpts.CUDAIsDevice) { +      if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) +        GV->setExternallyInitialized(true); +    } else { +      // Host-side shadows of external declarations of device-side +      // global variables become internal definitions. These have to +      // be internal in order to prevent name conflicts with global +      // host variables with the same name in a different TUs. +      if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) { +        Linkage = llvm::GlobalValue::InternalLinkage; + +        // Shadow variables and their properties must be registered +        // with CUDA runtime. +        unsigned Flags = 0; +        if (!D->hasDefinition()) +          Flags |= CGCUDARuntime::ExternDeviceVar; +        if (D->hasAttr<CUDAConstantAttr>()) +          Flags |= CGCUDARuntime::ConstantDeviceVar; +        getCUDARuntime().registerDeviceVar(*GV, Flags); +      } else if (D->hasAttr<CUDASharedAttr>()) +        // __shared__ variables are odd. Shadows do get created, but +        // they are not registered with the CUDA runtime, so they +        // can't really be used to access their device-side +        // counterparts. It's not clear yet whether it's nvcc's bug or +        // a feature, but we've got to do the same for compatibility. +        Linkage = llvm::GlobalValue::InternalLinkage; +    }    }    GV->setInitializer(Init); @@ -2470,9 +2513,6 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,    GV->setAlignment(getContext().getDeclAlign(D).getQuantity()); -  // Set the llvm linkage type as appropriate. -  llvm::GlobalValue::LinkageTypes Linkage = -      getLLVMLinkageVarDefinition(D, GV->isConstant());    // On Darwin, if the normal linkage of a C++ thread_local variable is    // LinkOnce or Weak, we keep the normal linkage to prevent multiple diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu index 7f5e159151c..81d23a2990d 100644 --- a/clang/test/CodeGenCUDA/device-stub.cu +++ b/clang/test/CodeGenCUDA/device-stub.cu @@ -2,6 +2,40 @@  #include "Inputs/cuda.h" +// CHECK-DAG: @device_var = internal global i32 +__device__ int device_var; + +// CHECK-DAG: @constant_var = internal global i32 +__constant__ int constant_var; + +// CHECK-DAG: @shared_var = internal global i32 +__shared__ int shared_var; + +// Make sure host globals don't get internalized... +// CHECK-DAG: @host_var = global i32 +int host_var; +// ... and that extern vars remain external. +// CHECK-DAG: @ext_host_var = external global i32 +extern int ext_host_var; + +// Shadows for external device-side variables are *definitions* of +// those variables. +// CHECK-DAG: @ext_device_var = internal global i32 +extern __device__ int ext_device_var; +// CHECK-DAG: @ext_device_var = internal global i32 +extern __constant__ int ext_constant_var; + +void use_pointers() { +  int *p; +  p = &device_var; +  p = &constant_var; +  p = &shared_var; +  p = &host_var; +  p = &ext_device_var; +  p = &ext_constant_var; +  p = &ext_host_var; +} +  // Make sure that all parts of GPU code init/cleanup are there:  // * constant unnamed string with the kernel name  // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00" @@ -32,9 +66,14 @@ __global__ void kernelfunc(int i, int j, int k) {}  // CHECK: call{{.*}}kernelfunc  void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } -// Test that we've built a function to register kernels -// CHECK: define internal void @__cuda_register_kernels +// Test that we've built a function to register kernels and global vars. +// CHECK: define internal void @__cuda_register_globals  // CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0 +// CHECK: ret void  // Test that we've built contructor..  // CHECK: define internal void @__cuda_module_ctor @@ -42,8 +81,8 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }  // CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper  //   .. stores return value in __cuda_gpubin_handle  // CHECK-NEXT: store{{.*}}__cuda_gpubin_handle -//   .. and then calls __cuda_register_kernels -// CHECK-NEXT: call void @__cuda_register_kernels +//   .. and then calls __cuda_register_globals +// CHECK-NEXT: call void @__cuda_register_globals  // Test that we've created destructor.  // CHECK: define internal void @__cuda_module_dtor diff --git a/clang/test/CodeGenCUDA/filter-decl.cu b/clang/test/CodeGenCUDA/filter-decl.cu index 023ae61f3af..bc744a07a33 100644 --- a/clang/test/CodeGenCUDA/filter-decl.cu +++ b/clang/test/CodeGenCUDA/filter-decl.cu @@ -9,15 +9,15 @@  // CHECK-DEVICE-NOT: module asm "file scope asm is host only"  __asm__("file scope asm is host only"); -// CHECK-HOST-NOT: constantdata = externally_initialized global +// CHECK-HOST: constantdata = internal global  // CHECK-DEVICE: constantdata = externally_initialized global  __constant__ char constantdata[256]; -// CHECK-HOST-NOT: devicedata = externally_initialized global +// CHECK-HOST: devicedata = internal global  // CHECK-DEVICE: devicedata = externally_initialized global  __device__ char devicedata[256]; -// CHECK-HOST-NOT: shareddata = global +// CHECK-HOST: shareddata = internal global  // CHECK-DEVICE: shareddata = global  __shared__ char shareddata[256];  | 

