5 files changed, 157 insertions, 34 deletions
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 9dd7928bcf9..f0ecb57c714 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -38,6 +38,7 @@ private:
   llvm::Module &TheModule;
   /// Keeps track of kernel launch stubs emitted in this module
   llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
+  llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
   /// Keeps track of variables containing handles of GPU binaries. Populated by
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
@@ -47,7 +48,7 @@ private:
   llvm::Constant *getLaunchFn() const;
 
   /// Creates a function to register all kernel stubs generated in this module.
-  llvm::Function *makeRegisterKernelsFn();
+  llvm::Function *makeRegisterGlobalsFn();
 
   /// Helper function that generates a constant string and returns a pointer to
   /// the start of the string.  The result of this function can be used anywhere
@@ -68,6 +69,10 @@ public:
   CGNVCUDARuntime(CodeGenModule &CGM);
 
   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
+  void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
+    DeviceVars.push_back(std::make_pair(&Var, Flags));
+  }
+
   /// Creates module constructor function
   llvm::Function *makeModuleCtorFunction() override;
   /// Creates module destructor function
@@ -158,19 +163,24 @@ void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
   CGF.EmitBlock(EndBlock);
 }
 
-/// Creates internal function to register all kernel stubs generated in this
-/// module with the CUDA runtime.
+/// Creates a function that sets up state on the host side for CUDA objects that
+/// have a presence on both the host and device sides. Specifically, registers
+/// the host side of kernel functions and device global variables with the CUDA
+/// runtime.
 /// \code
-/// void __cuda_register_kernels(void** GpuBinaryHandle) {
+/// void __cuda_register_globals(void** GpuBinaryHandle) {
 ///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
 ///    ...
 ///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
+///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
+///    ...
+///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
 /// }
 /// \endcode
-llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
+llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule);
+      llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
   llvm::BasicBlock *EntryBB =
       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
   CGBuilderTy Builder(CGM, Context);
@@ -186,18 +196,44 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
       "__cudaRegisterFunction");
 
   // Extract GpuBinaryHandle passed as the first argument passed to
-  // __cuda_register_kernels() and generate __cudaRegisterFunction() call for
+  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
   // each emitted kernel.
   llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
   for (llvm::Function *Kernel : EmittedKernels) {
     llvm::Constant *KernelName = makeConstantString(Kernel->getName());
     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
-    llvm::Value *args[] = {
+    llvm::Value *Args[] = {
         &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
         KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
         NullPtr, NullPtr, NullPtr,
         llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
-    Builder.CreateCall(RegisterFunc, args);
+    Builder.CreateCall(RegisterFunc, Args);
+  }
+
+  // void __cudaRegisterVar(void **, char *, char *, const char *,
+  //                        int, int, int, int)
+  std::vector<llvm::Type *> RegisterVarParams = {
+      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy,
+      IntTy,        IntTy,     IntTy,     IntTy};
+  llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy, RegisterVarParams, false),
+      "__cudaRegisterVar");
+  for (auto &Pair : DeviceVars) {
+    llvm::GlobalVariable *Var = Pair.first;
+    unsigned Flags = Pair.second;
+    llvm::Constant *VarName = makeConstantString(Var->getName());
+    uint64_t VarSize =
+        CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
+    llvm::Value *Args[] = {
+        &GpuBinaryHandlePtr,
+        Builder.CreateBitCast(Var, VoidPtrTy),
+        VarName,
+        VarName,
+        llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
+        llvm::ConstantInt::get(IntTy, VarSize),
+        llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
+        llvm::ConstantInt::get(IntTy, 0)};
+    Builder.CreateCall(RegisterVar, Args);
   }
 
   Builder.CreateRetVoid();
@@ -208,15 +244,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
 /// \code
 /// void __cuda_module_ctor(void*) {
 ///     Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
-///     __cuda_register_kernels(Handle0);
+///     __cuda_register_globals(Handle0);
 ///     ...
 ///     HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
-///     __cuda_register_kernels(HandleN);
+///     __cuda_register_globals(HandleN);
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
-  // void __cuda_register_kernels(void* handle);
-  llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
+  // void __cuda_register_globals(void* handle);
+  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
   // void ** __cudaRegisterFatBinary(void *);
   llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
@@ -272,8 +308,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                    CGM.getPointerAlign());
 
-    // Call __cuda_register_kernels(GpuBinaryHandle);
-    CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
+    // Call __cuda_register_globals(GpuBinaryHandle);
+    CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
 
     // Save GpuBinaryHandle so we can unregister it in destructor.
     GpuBinaryHandles.push_back(GpuBinaryHandle);
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index dcacf970327..0168f4f9e94 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -18,6 +18,7 @@
 
 namespace llvm {
 class Function;
+class GlobalVariable;
 }
 
 namespace clang {
@@ -37,6 +38,12 @@ protected:
   CodeGenModule &CGM;
 
 public:
+  // Global variable properties that must be passed to CUDA runtime.
+  enum DeviceVarFlags {
+    ExternDeviceVar = 0x01,   // extern
+    ConstantDeviceVar = 0x02, // __constant__
+  };
+
   CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
   virtual ~CGCUDARuntime();
 
@@ -46,6 +53,7 @@ public:
 
   /// Emits a kernel launch stub.
   virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
+  virtual void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) = 0;
 
   /// Constructs and returns a module initialization function or nullptr if it's
   /// not needed. Must be called after all kernels have been emitted.
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 887c53331f6..286ccb1595f 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1528,11 +1528,18 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
           !Global->hasAttr<CUDASharedAttr>())
         return;
     } else {
-      if (!Global->hasAttr<CUDAHostAttr>() && (
-            Global->hasAttr<CUDADeviceAttr>() ||
-            Global->hasAttr<CUDAConstantAttr>() ||
-            Global->hasAttr<CUDASharedAttr>()))
+      // We need to emit host-side 'shadows' for all global
+      // device-side variables because the CUDA runtime needs their
+      // size and host-side address in order to provide access to
+      // their device-side incarnations.
+
+      // So device-only functions are the only things we skip.
+      if (isa<FunctionDecl>(Global) && !Global->hasAttr<CUDAHostAttr>() &&
+          Global->hasAttr<CUDADeviceAttr>())
         return;
+
+      assert((isa<FunctionDecl>(Global) || isa<VarDecl>(Global)) &&
+             "Expected Variable or Function");
     }
   }
 
@@ -1561,8 +1568,15 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
   } else {
     const auto *VD = cast<VarDecl>(Global);
     assert(VD->isFileVarDecl() && "Cannot emit local var decl as global.");
-
-    if (VD->isThisDeclarationADefinition() != VarDecl::Definition &&
+    // We need to emit device-side global CUDA variables even if a
+    // variable does not have a definition -- we still need to define
+    // host-side shadow for it.
+    bool MustEmitForCuda = LangOpts.CUDA && !LangOpts.CUDAIsDevice &&
+                           !VD->hasDefinition() &&
+                           (VD->hasAttr<CUDAConstantAttr>() ||
+                            VD->hasAttr<CUDADeviceAttr>());
+    if (!MustEmitForCuda &&
+        VD->isThisDeclarationADefinition() != VarDecl::Definition &&
         !Context.isMSStaticDataMemberInlineDefinition(VD))
       return;
   }
@@ -2444,6 +2458,10 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
   if (D->hasAttr<AnnotateAttr>())
     AddGlobalAnnotations(D, GV);
 
+  // Set the llvm linkage type as appropriate.
+  llvm::GlobalValue::LinkageTypes Linkage =
+      getLLVMLinkageVarDefinition(D, GV->isConstant());
+
   // CUDA B.2.1 "The __device__ qualifier declares a variable that resides on
   // the device. [...]"
   // CUDA B.2.2 "The __constant__ qualifier, optionally used together with
@@ -2451,9 +2469,34 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
   // Is accessible from all the threads within the grid and from the host
   // through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()
   // / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())."
-  if (GV && LangOpts.CUDA && LangOpts.CUDAIsDevice &&
-      (D->hasAttr<CUDAConstantAttr>() || D->hasAttr<CUDADeviceAttr>())) {
-    GV->setExternallyInitialized(true);
+  if (GV && LangOpts.CUDA) {
+    if (LangOpts.CUDAIsDevice) {
+      if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>())
+        GV->setExternallyInitialized(true);
+    } else {
+      // Host-side shadows of external declarations of device-side
+      // global variables become internal definitions. These have to
+      // be internal in order to prevent name conflicts with global
+      // host variables with the same name in a different TUs.
+      if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
+        Linkage = llvm::GlobalValue::InternalLinkage;
+
+        // Shadow variables and their properties must be registered
+        // with CUDA runtime.
+        unsigned Flags = 0;
+        if (!D->hasDefinition())
+          Flags |= CGCUDARuntime::ExternDeviceVar;
+        if (D->hasAttr<CUDAConstantAttr>())
+          Flags |= CGCUDARuntime::ConstantDeviceVar;
+        getCUDARuntime().registerDeviceVar(*GV, Flags);
+      } else if (D->hasAttr<CUDASharedAttr>())
+        // __shared__ variables are odd. Shadows do get created, but
+        // they are not registered with the CUDA runtime, so they
+        // can't really be used to access their device-side
+        // counterparts. It's not clear yet whether it's nvcc's bug or
+        // a feature, but we've got to do the same for compatibility.
+        Linkage = llvm::GlobalValue::InternalLinkage;
+    }
   }
   GV->setInitializer(Init);
 
@@ -2470,9 +2513,6 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
 
   GV->setAlignment(getContext().getDeclAlign(D).getQuantity());
 
-  // Set the llvm linkage type as appropriate.
-  llvm::GlobalValue::LinkageTypes Linkage =
-      getLLVMLinkageVarDefinition(D, GV->isConstant());
 
   // On Darwin, if the normal linkage of a C++ thread_local variable is
   // LinkOnce or Weak, we keep the normal linkage to prevent multiple
diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu
index 7f5e159151c..81d23a2990d 100644
--- a/clang/test/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CodeGenCUDA/device-stub.cu
@@ -2,6 +2,40 @@
 
 #include "Inputs/cuda.h"
 
+// CHECK-DAG: @device_var = internal global i32
+__device__ int device_var;
+
+// CHECK-DAG: @constant_var = internal global i32
+__constant__ int constant_var;
+
+// CHECK-DAG: @shared_var = internal global i32
+__shared__ int shared_var;
+
+// Make sure host globals don't get internalized...
+// CHECK-DAG: @host_var = global i32
+int host_var;
+// ... and that extern vars remain external.
+// CHECK-DAG: @ext_host_var = external global i32
+extern int ext_host_var;
+
+// Shadows for external device-side variables are *definitions* of
+// those variables.
+// CHECK-DAG: @ext_device_var = internal global i32
+extern __device__ int ext_device_var;
+// CHECK-DAG: @ext_device_var = internal global i32
+extern __constant__ int ext_constant_var;
+
+void use_pointers() {
+  int *p;
+  p = &device_var;
+  p = &constant_var;
+  p = &shared_var;
+  p = &host_var;
+  p = &ext_device_var;
+  p = &ext_constant_var;
+  p = &ext_host_var;
+}
+
 // Make sure that all parts of GPU code init/cleanup are there:
 // * constant unnamed string with the kernel name
 // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
@@ -32,9 +66,14 @@ __global__ void kernelfunc(int i, int j, int k) {}
 // CHECK: call{{.*}}kernelfunc
 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 
-// Test that we've built a function to register kernels
-// CHECK: define internal void @__cuda_register_kernels
+// Test that we've built a function to register kernels and global vars.
+// CHECK: define internal void @__cuda_register_globals
 // CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
+// CHECK: ret void
 
 // Test that we've built contructor..
 // CHECK: define internal void @__cuda_module_ctor
@@ -42,8 +81,8 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
 //   .. stores return value in __cuda_gpubin_handle
 // CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
-//   .. and then calls __cuda_register_kernels
-// CHECK-NEXT: call void @__cuda_register_kernels
+//   .. and then calls __cuda_register_globals
+// CHECK-NEXT: call void @__cuda_register_globals
 
 // Test that we've created destructor.
 // CHECK: define internal void @__cuda_module_dtor
diff --git a/clang/test/CodeGenCUDA/filter-decl.cu b/clang/test/CodeGenCUDA/filter-decl.cu
index 023ae61f3af..bc744a07a33 100644
--- a/clang/test/CodeGenCUDA/filter-decl.cu
+++ b/clang/test/CodeGenCUDA/filter-decl.cu
@@ -9,15 +9,15 @@
 // CHECK-DEVICE-NOT: module asm "file scope asm is host only"
 __asm__("file scope asm is host only");
 
-// CHECK-HOST-NOT: constantdata = externally_initialized global
+// CHECK-HOST: constantdata = internal global
 // CHECK-DEVICE: constantdata = externally_initialized global
 __constant__ char constantdata[256];
 
-// CHECK-HOST-NOT: devicedata = externally_initialized global
+// CHECK-HOST: devicedata = internal global
 // CHECK-DEVICE: devicedata = externally_initialized global
 __device__ char devicedata[256];
 
-// CHECK-HOST-NOT: shareddata = global
+// CHECK-HOST: shareddata = internal global
 // CHECK-DEVICE: shareddata = global
 __shared__ char shareddata[256];