5 files changed, 113 insertions, 40 deletions
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index fcc0d53b2d4..0523282e0a3 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -9420,7 +9420,8 @@ public:
   ///
   /// Use this rather than examining the function's attributes yourself -- you
   /// will get it wrong.  Returns CFT_Host if D is null.
-  CUDAFunctionTarget IdentifyCUDATarget(const FunctionDecl *D);
+  CUDAFunctionTarget IdentifyCUDATarget(const FunctionDecl *D,
+                                        bool IgnoreImplicitHDAttr = false);
   CUDAFunctionTarget IdentifyCUDATarget(const AttributeList *Attr);
 
   /// Gets the CUDA target for the current context.
@@ -9522,7 +9523,10 @@ public:
 
   /// Check whether NewFD is a valid overload for CUDA. Emits
   /// diagnostics and invalidates NewFD if not.
-  void checkCUDATargetOverload(FunctionDecl *NewFD, LookupResult &Previous);
+  void checkCUDATargetOverload(FunctionDecl *NewFD,
+                               const LookupResult &Previous);
+  /// Copies target attributes from the template TD to the function FD.
+  void inheritCUDATargetAttrs(FunctionDecl *FD, const FunctionTemplateDecl &TD);
 
   /// \name Code completion
   //@{
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 5e6d0e3e53b..6f272ec839f 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -93,8 +93,17 @@ Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const AttributeList *Attr) {
   return CFT_Host;
 }
 
+template <typename A>
+static bool hasAttr(const FunctionDecl *D, bool IgnoreImplicitAttr) {
+  return D->hasAttrs() && llvm::any_of(D->getAttrs(), [&](Attr *Attribute) {
+           return isa<A>(Attribute) &&
+                  !(IgnoreImplicitAttr && Attribute->isImplicit());
+         });
+}
+
 /// IdentifyCUDATarget - Determine the CUDA compilation target for this function
-Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D) {
+Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D,
+                                                  bool IgnoreImplicitHDAttr) {
   // Code that lives outside a function is run on the host.
   if (D == nullptr)
     return CFT_Host;
@@ -105,13 +114,13 @@ Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D) {
   if (D->hasAttr<CUDAGlobalAttr>())
     return CFT_Global;
 
-  if (D->hasAttr<CUDADeviceAttr>()) {
-    if (D->hasAttr<CUDAHostAttr>())
+  if (hasAttr<CUDADeviceAttr>(D, IgnoreImplicitHDAttr)) {
+    if (hasAttr<CUDAHostAttr>(D, IgnoreImplicitHDAttr))
       return CFT_HostDevice;
     return CFT_Device;
-  } else if (D->hasAttr<CUDAHostAttr>()) {
+  } else if (hasAttr<CUDAHostAttr>(D, IgnoreImplicitHDAttr)) {
     return CFT_Host;
-  } else if (D->isImplicit()) {
+  } else if (D->isImplicit() && !IgnoreImplicitHDAttr) {
     // Some implicit declarations (like intrinsic functions) are not marked.
     // Set the most lenient target on them for maximal flexibility.
     return CFT_HostDevice;
@@ -856,7 +865,7 @@ void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) {
 }
 
 void Sema::checkCUDATargetOverload(FunctionDecl *NewFD,
-                                   LookupResult &Previous) {
+                                   const LookupResult &Previous) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
   CUDAFunctionTarget NewTarget = IdentifyCUDATarget(NewFD);
   for (NamedDecl *OldND : Previous) {
@@ -883,3 +892,21 @@ void Sema::checkCUDATargetOverload(FunctionDecl *NewFD,
     }
   }
 }
+
+template <typename AttrTy>
+static void copyAttrIfPresent(Sema &S, FunctionDecl *FD,
+                              const FunctionDecl &TemplateFD) {
+  if (AttrTy *Attribute = TemplateFD.getAttr<AttrTy>()) {
+    AttrTy *Clone = Attribute->clone(S.Context);
+    Clone->setInherited(true);
+    FD->addAttr(Clone);
+  }
+}
+
+void Sema::inheritCUDATargetAttrs(FunctionDecl *FD,
+                                  const FunctionTemplateDecl &TD) {
+  const FunctionDecl &TemplateFD = *TD.getTemplatedDecl();
+  copyAttrIfPresent<CUDAGlobalAttr>(*this, FD, TemplateFD);
+  copyAttrIfPresent<CUDAHostAttr>(*this, FD, TemplateFD);
+  copyAttrIfPresent<CUDADeviceAttr>(*this, FD, TemplateFD);
+}
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index be2466c9235..5d13c8fa039 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -8305,9 +8305,6 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
   // Handle attributes.
   ProcessDeclAttributes(S, NewFD, D);
 
-  if (getLangOpts().CUDA)
-    maybeAddCUDAHostDeviceAttrs(NewFD, Previous);
-
   if (getLangOpts().OpenCL) {
     // OpenCL v1.1 s6.5: Using an address space qualifier in a function return
     // type declaration will generate a compilation error.
@@ -8410,6 +8407,15 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
       TemplateArgs.setRAngleLoc(D.getIdentifierLoc());
     }
 
+    // We do not add HD attributes to specializations here because
+    // they may have different constexpr-ness compared to their
+    // templates and, after maybeAddCUDAHostDeviceAttrs() is applied,
+    // may end up with different effective targets. Instead, a
+    // specialization inherits its target attributes from its template
+    // in the CheckFunctionTemplateSpecialization() call below.
+    if (getLangOpts().CUDA & !isFunctionTemplateSpecialization)
+      maybeAddCUDAHostDeviceAttrs(NewFD, Previous);
+
     // If it's a friend (and only if it's a friend), it's possible
     // that either the specialized function type or the specialized
     // template is dependent, and therefore matching will fail.  In
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 898765cbd79..4846b25232a 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -7043,13 +7043,15 @@ bool Sema::CheckFunctionTemplateSpecialization(
         continue;
       }
 
-      // Target attributes are part of function signature during cuda
-      // compilation, so deduced template must also have matching CUDA
-      // target. Given that regular template deduction does not take
-      // target attributes into account, we perform target match check
-      // here and reject candidates that have different target.
+      // Target attributes are part of the cuda function signature, so
+      // the deduced template's cuda target must match that of the
+      // specialization.  Given that C++ template deduction does not
+      // take target attributes into account, we reject candidates
+      // here that have a different target.
       if (LangOpts.CUDA &&
-          IdentifyCUDATarget(Specialization) != IdentifyCUDATarget(FD)) {
+          IdentifyCUDATarget(Specialization,
+                             /* IgnoreImplicitHDAttributes = */ true) !=
+              IdentifyCUDATarget(FD, /* IgnoreImplicitHDAttributes = */ true)) {
         FailedCandidates.addCandidate().set(
             I.getPair(), FunTmpl->getTemplatedDecl(),
             MakeDeductionFailureInfo(Context, TDK_CUDATargetMismatch, Info));
@@ -7166,6 +7168,14 @@ bool Sema::CheckFunctionTemplateSpecialization(
       SpecInfo->getTemplateSpecializationKind(),
       ExplicitTemplateArgs ? &ConvertedTemplateArgs[Specialization] : nullptr);
 
+  // A function template specialization inherits the target attributes
+  // of its template.  (We require the attributes explicitly in the
+  // code to match, but a template may have implicit attributes by
+  // virtue e.g. of being constexpr, and it passes these implicit
+  // attributes on to its specializations.)
+  if (LangOpts.CUDA)
+    inheritCUDATargetAttrs(FD, *Specialization->getPrimaryTemplate());
+
   // The "previous declaration" for this function template specialization is
   // the prior function template specialization.
   Previous.clear();
@@ -8154,24 +8164,19 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S,
       continue;
     }
 
-    // Target attributes are part of function signature during cuda
-    // compilation, so deduced template must also have matching CUDA
-    // target. Given that regular template deduction does not take it
-    // into account, we perform target match check here and reject
-    // candidates that have different target.
-    if (LangOpts.CUDA) {
-      CUDAFunctionTarget DeclaratorTarget = IdentifyCUDATarget(Attr);
-      // We need to adjust target when HD is forced by
-      // #pragma clang force_cuda_host_device
-      if (ForceCUDAHostDeviceDepth > 0 &&
-          (DeclaratorTarget == CFT_Device || DeclaratorTarget == CFT_Host))
-        DeclaratorTarget = CFT_HostDevice;
-      if (IdentifyCUDATarget(Specialization) != DeclaratorTarget) {
-        FailedCandidates.addCandidate().set(
-            P.getPair(), FunTmpl->getTemplatedDecl(),
-            MakeDeductionFailureInfo(Context, TDK_CUDATargetMismatch, Info));
-        continue;
-      }
+    // Target attributes are part of the cuda function signature, so
+    // the cuda target of the instantiated function must match that of its
+    // template.  Given that C++ template deduction does not take
+    // target attributes into account, we reject candidates here that
+    // have a different target.
+    if (LangOpts.CUDA &&
+        IdentifyCUDATarget(Specialization,
+                           /* IgnoreImplicitHDAttributes = */ true) !=
+            IdentifyCUDATarget(Attr)) {
+      FailedCandidates.addCandidate().set(
+          P.getPair(), FunTmpl->getTemplatedDecl(),
+          MakeDeductionFailureInfo(Context, TDK_CUDATargetMismatch, Info));
+      continue;
     }
 
     Matches.addDecl(Specialization, P.getAccess());
diff --git a/clang/test/SemaCUDA/function-template-overload.cu b/clang/test/SemaCUDA/function-template-overload.cu
index 8adeb849570..56bba653958 100644
--- a/clang/test/SemaCUDA/function-template-overload.cu
+++ b/clang/test/SemaCUDA/function-template-overload.cu
@@ -31,7 +31,8 @@ template  <> __device__ DType overload_h_d(long a); // OK. instantiates D
 template  <> __host__ HType overload_h_d(long a); // OK. instantiates H
 
 
-// Can't overload HD template with H or D template, though functions are OK.
+// Can't overload HD template with H or D template, though
+// non-template functions are OK.
 template <typename T> __host__ __device__ HDType overload_hd(T a) { return HDType(); }
 // expected-note@-1 {{previous declaration is here}}
 // expected-note@-2 2 {{candidate template ignored: could not match 'HDType' against 'HType'}}
@@ -56,24 +57,54 @@ template <typename T> __host__ HType overload_h_d2(T a) { return HType(); }
 template <typename T> __host__ __device__ HDType overload_h_d2(T a) { return HDType(); }
 template <typename T1, typename T2 = int> __device__ DType overload_h_d2(T1 a) { T1 x; T2 y; return DType(); }
 
+// constexpr functions are implicitly HD, but explicit
+// instantiation/specialization must use target attributes as written.
+template <typename T> constexpr T overload_ce_implicit_hd(T a) { return a+1; }
+// expected-note@-1 3 {{candidate template ignored: target attributes do not match}}
+
+// These will not match the template.
+template __host__ __device__ int overload_ce_implicit_hd(int a);
+// expected-error@-1 {{explicit instantiation of 'overload_ce_implicit_hd' does not refer to a function template, variable template, member function, member class, or static data member}}
+template <> __host__ __device__ long overload_ce_implicit_hd(long a);
+// expected-error@-1 {{no function template matches function template specialization 'overload_ce_implicit_hd'}}
+template <> __host__ __device__ constexpr long overload_ce_implicit_hd(long a);
+// expected-error@-1 {{no function template matches function template specialization 'overload_ce_implicit_hd'}}
+
+// These should work, because template matching ignores the implicit
+// HD attributes the compiler gives to constexpr functions/templates,
+// so 'overload_ce_implicit_hd' template will match __host__ functions
+// only.
+template __host__ int overload_ce_implicit_hd(int a);
+template <> __host__ long overload_ce_implicit_hd(long a);
+
+template float overload_ce_implicit_hd(float a);
+template <> float* overload_ce_implicit_hd(float *a);
+template <> constexpr double overload_ce_implicit_hd(double a) { return a + 3.0; };
+
 __host__ void hf() {
   overload_hd(13);
+  overload_ce_implicit_hd('h');        // Implicitly instantiated
+  overload_ce_implicit_hd(1.0f);       // Explicitly instantiated
+  overload_ce_implicit_hd(2.0);        // Explicitly specialized
 
   HType h = overload_h_d(10);
   HType h2i = overload_h_d2<int>(11);
   HType h2ii = overload_h_d2<int>(12);
 
   // These should be implicitly instantiated from __host__ template returning HType.
-  DType d = overload_h_d(20); // expected-error {{no viable conversion from 'HType' to 'DType'}}
-  DType d2i = overload_h_d2<int>(21); // expected-error {{no viable conversion from 'HType' to 'DType'}}
+  DType d = overload_h_d(20);          // expected-error {{no viable conversion from 'HType' to 'DType'}}
+  DType d2i = overload_h_d2<int>(21);  // expected-error {{no viable conversion from 'HType' to 'DType'}}
   DType d2ii = overload_h_d2<int>(22); // expected-error {{no viable conversion from 'HType' to 'DType'}}
 }
 __device__ void df() {
   overload_hd(23);
+  overload_ce_implicit_hd('d');        // Implicitly instantiated
+  overload_ce_implicit_hd(1.0f);       // Explicitly instantiated
+  overload_ce_implicit_hd(2.0);        // Explicitly specialized
 
   // These should be implicitly instantiated from __device__ template returning DType.
-  HType h = overload_h_d(10); // expected-error {{no viable conversion from 'DType' to 'HType'}}
-  HType h2i = overload_h_d2<int>(11); // expected-error {{no viable conversion from 'DType' to 'HType'}}
+  HType h = overload_h_d(10);          // expected-error {{no viable conversion from 'DType' to 'HType'}}
+  HType h2i = overload_h_d2<int>(11);  // expected-error {{no viable conversion from 'DType' to 'HType'}}
   HType h2ii = overload_h_d2<int>(12); // expected-error {{no viable conversion from 'DType' to 'HType'}}
 
   DType d = overload_h_d(20);