Index: cfe/trunk/include/clang/AST/ASTContext.h
===================================================================
--- cfe/trunk/include/clang/AST/ASTContext.h
+++ cfe/trunk/include/clang/AST/ASTContext.h
@@ -448,12 +448,6 @@
   /// \brief Allocator for partial diagnostics.
   PartialDiagnostic::StorageAllocator DiagAllocator;
 
-  /// Diagnostics that are emitted if and only if the given function is
-  /// codegen'ed. Access these through FunctionDecl::addDeferredDiag() and
-  /// FunctionDecl::takeDeferredDiags().
-  llvm::DenseMap<const FunctionDecl *, std::vector<PartialDiagnosticAt>>
-      DeferredDiags;
-
   /// \brief The current C++ ABI.
   std::unique_ptr<CXXABI> ABI;
   CXXABI *createCXXABI(const TargetInfo &T);
@@ -604,11 +598,6 @@
     return DiagAllocator;
   }
 
-  decltype(DeferredDiags) &getDeferredDiags() { return DeferredDiags; }
-  const decltype(DeferredDiags) &getDeferredDiags() const {
-    return DeferredDiags;
-  }
-
   const TargetInfo &getTargetInfo() const { return *Target; }
   const TargetInfo *getAuxTargetInfo() const { return AuxTarget; }
Index: cfe/trunk/include/clang/AST/Decl.h
===================================================================
--- cfe/trunk/include/clang/AST/Decl.h
+++ cfe/trunk/include/clang/AST/Decl.h
@@ -2271,14 +2271,6 @@
   /// returns 0.
   unsigned getMemoryFunctionKind() const;
 
-  /// Add a diagnostic to be emitted if and when this function is codegen'ed.
-  void addDeferredDiag(PartialDiagnosticAt PD);
-
-  /// Gets this object's list of deferred diagnostics, if there are any.
-  ///
-  /// Although this is logically const, it clears our list of deferred diags.
-  std::vector<PartialDiagnosticAt> takeDeferredDiags() const;
-
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
   static bool classofKind(Kind K) {
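For reference, the API removed above stored deferred diagnostics on the ASTContext, keyed by FunctionDecl, and drained them with a find/move/erase pattern. A minimal standalone sketch of that idiom (the helper name and key type are illustrative, not from the patch):

    #include "llvm/ADT/DenseMap.h"
    #include <utility>
    #include <vector>

    // Move the stored vector out of the map, then erase the entry, so each
    // stored item is handed out exactly once. This mirrors the removed
    // FunctionDecl::takeDeferredDiags() and the Sema-side EmitDeferredDiags
    // introduced later in this patch.
    template <typename T>
    std::vector<T> takeAll(llvm::DenseMap<const void *, std::vector<T>> &Map,
                           const void *Key) {
      auto It = Map.find(Key);
      if (It == Map.end())
        return {};
      std::vector<T> Ret = std::move(It->second);
      Map.erase(It);
      return Ret;
    }

The same per-function diagnostics now live in Sema (see the Sema.h changes below), which can emit them the moment a function becomes known-emitted instead of waiting for codegen.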
Index: cfe/trunk/include/clang/Sema/Sema.h
===================================================================
--- cfe/trunk/include/clang/Sema/Sema.h
+++ cfe/trunk/include/clang/Sema/Sema.h
@@ -9245,6 +9245,30 @@
   /// before incrementing, so you can emit an error.
   bool PopForceCUDAHostDevice();
 
+  /// Diagnostics that are emitted only if we discover that the given function
+  /// must be codegen'ed. Because handling these correctly adds overhead to
+  /// compilation, this is currently only enabled for CUDA compilations.
+  llvm::DenseMap<const FunctionDecl *, std::vector<PartialDiagnosticAt>>
+      CUDADeferredDiags;
+
+  /// Raw encodings of SourceLocations for which CheckCUDACall has emitted a
+  /// (maybe deferred) "bad call" diagnostic. We use this to avoid emitting the
+  /// same deferred diag twice.
+  llvm::DenseSet<unsigned> LocsWithCUDACallDiags;
+
+  /// The set of CUDA functions that we've discovered must be emitted by tracing
+  /// the call graph. Functions that we can tell a priori must be emitted
+  /// aren't added to this set.
+  llvm::DenseSet<FunctionDecl *> CUDAKnownEmittedFns;
+
+  /// A partial call graph maintained during CUDA compilation to support
+  /// deferred diagnostics. Specifically, functions are only added here if, at
+  /// the time they're added, they are not known-emitted. As soon as we
+  /// discover that a function is known-emitted, we remove it and everything it
+  /// transitively calls from this set and add those functions to
+  /// CUDAKnownEmittedFns.
+  llvm::DenseMap<FunctionDecl *, llvm::SetVector<FunctionDecl *>> CUDACallGraph;
+
   /// Diagnostic builder for CUDA errors which may or may not be deferred.
   ///
   /// In CUDA, there exist constructs (e.g. variable-length arrays, try/catch)
@@ -9298,12 +9322,15 @@
 private:
   struct PartialDiagnosticInfo {
-    PartialDiagnosticInfo(SourceLocation Loc, PartialDiagnostic PD,
+    PartialDiagnosticInfo(Sema &S, SourceLocation Loc, PartialDiagnostic PD,
                           FunctionDecl *Fn)
-        : Loc(Loc), PD(std::move(PD)), Fn(Fn) {}
+        : S(S), Loc(Loc), PD(std::move(PD)), Fn(Fn) {}
 
-    ~PartialDiagnosticInfo() { Fn->addDeferredDiag({Loc, std::move(PD)}); }
+    ~PartialDiagnosticInfo() {
+      S.CUDADeferredDiags[Fn].push_back({Loc, std::move(PD)});
+    }
 
+    Sema &S;
     SourceLocation Loc;
     PartialDiagnostic PD;
     FunctionDecl *Fn;
@@ -9322,8 +9349,8 @@
   /// - If CurContext is a __device__ or __global__ function, emits the
   ///   diagnostics immediately.
   /// - If CurContext is a __host__ __device__ function and we are compiling for
-  ///   the device, creates a deferred diagnostic which is emitted if and when
-  ///   the function is codegen'ed.
+  ///   the device, creates a diagnostic which is emitted if and when we realize
+  ///   that the function will be codegen'ed.
   ///
   /// Example usage:
   ///
@@ -9397,12 +9424,6 @@
   void maybeAddCUDAHostDeviceAttrs(Scope *S, FunctionDecl *FD,
                                    const LookupResult &Previous);
 
-private:
-  /// Raw encodings of SourceLocations for which CheckCUDACall has emitted a
-  /// (maybe deferred) "bad call" diagnostic. We use this to avoid emitting the
-  /// same deferred diag twice.
-  llvm::DenseSet<unsigned> LocsWithCUDACallDiags;
-
 public:
   /// Check whether we're allowed to call Callee from the current context.
   ///
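A self-contained sketch of the invariant the new members maintain (types simplified and names illustrative; the real logic lands in SemaCUDA.cpp below):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/DenseSet.h"
    #include "llvm/ADT/SetVector.h"
    #include "llvm/ADT/SmallVector.h"

    struct Fn {};  // stand-in for FunctionDecl

    llvm::DenseSet<Fn *> KnownEmitted;                  // cf. CUDAKnownEmittedFns
    llvm::DenseMap<Fn *, llvm::SetVector<Fn *>> Calls;  // cf. CUDACallGraph

    // Once Root is discovered to be emitted, everything it transitively calls
    // is emitted too; those nodes leave the partial call graph for good.
    void markEmitted(Fn *Root) {
      llvm::SmallVector<Fn *, 4> Worklist = {Root};
      while (!Worklist.empty()) {
        Fn *F = Worklist.pop_back_val();
        if (!KnownEmitted.insert(F).second)
          continue;  // already processed
        auto It = Calls.find(F);
        if (It == Calls.end())
          continue;
        Worklist.append(It->second.begin(), It->second.end());
        Calls.erase(It);
      }
    }

The real implementation additionally emits the deferred diagnostics for each newly known-emitted function; see MarkKnownEmitted in SemaCUDA.cpp below.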
Index: cfe/trunk/lib/AST/Decl.cpp
===================================================================
--- cfe/trunk/lib/AST/Decl.cpp
+++ cfe/trunk/lib/AST/Decl.cpp
@@ -3473,20 +3473,6 @@
   return 0;
 }
 
-void FunctionDecl::addDeferredDiag(PartialDiagnosticAt PD) {
-  getASTContext().getDeferredDiags()[this].push_back(std::move(PD));
-}
-
-std::vector<PartialDiagnosticAt> FunctionDecl::takeDeferredDiags() const {
-  auto &DD = getASTContext().getDeferredDiags();
-  auto It = DD.find(this);
-  if (It == DD.end())
-    return {};
-  auto Ret = std::move(It->second);
-  DD.erase(It);
-  return Ret;
-}
-
 //===----------------------------------------------------------------------===//
 // FieldDecl Implementation
 //===----------------------------------------------------------------------===//
Index: cfe/trunk/lib/CodeGen/CodeGenModule.h
===================================================================
--- cfe/trunk/lib/CodeGen/CodeGenModule.h
+++ cfe/trunk/lib/CodeGen/CodeGenModule.h
@@ -490,10 +490,6 @@
   /// MDNodes.
   llvm::DenseMap<QualType, llvm::Metadata *> MetadataIdMap;
 
-  /// Diags gathered from FunctionDecl::takeDeferredDiags(). Emitted at the
-  /// very end of codegen.
-  std::vector<std::pair<SourceLocation, PartialDiagnostic>> DeferredDiags;
-
 public:
   CodeGenModule(ASTContext &C, const HeaderSearchOptions &headersearchopts,
                 const PreprocessorOptions &ppopts,
Index: cfe/trunk/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CodeGenModule.cpp
+++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp
@@ -499,19 +499,6 @@
   EmitVersionIdentMetadata();
 
   EmitTargetMetadata();
-
-  // Emit any deferred diagnostics gathered during codegen. We didn't emit them
-  // when we first discovered them because that would have halted codegen,
-  // preventing us from gathering other deferred diags.
-  for (const PartialDiagnosticAt &DiagAt : DeferredDiags) {
-    SourceLocation Loc = DiagAt.first;
-    const PartialDiagnostic &PD = DiagAt.second;
-    DiagnosticBuilder Builder(getDiags().Report(Loc, PD.getDiagID()));
-    PD.Emit(Builder);
-  }
-  // Clear the deferred diags so they don't outlive the ASTContext's
-  // PartialDiagnostic allocator.
-  DeferredDiags.clear();
 }
 
 void CodeGenModule::UpdateCompletedType(const TagDecl *TD) {
@@ -2913,37 +2900,6 @@
                                                  llvm::GlobalValue *GV) {
   const auto *D = cast<FunctionDecl>(GD.getDecl());
 
-  // Emit this function's deferred diagnostics, if none of them are errors. If
-  // any of them are errors, don't codegen the function, but also don't emit any
-  // of the diagnostics just yet. Emitting an error during codegen stops
-  // further codegen, and we want to display as many deferred diags as possible.
-  // We'll emit the now twice-deferred diags at the very end of codegen.
-  //
-  // (If a function has both error and non-error diags, we don't emit the
-  // non-error diags here, because order can be significant, e.g. with notes
-  // that follow errors.)
-  auto Diags = D->takeDeferredDiags();
-  if (auto *Templ = D->getPrimaryTemplate()) {
-    auto TemplDiags = Templ->getAsFunction()->takeDeferredDiags();
-    Diags.insert(Diags.end(), TemplDiags.begin(), TemplDiags.end());
-  }
-  bool HasError = llvm::any_of(Diags, [this](const PartialDiagnosticAt &PDAt) {
-    return getDiags().getDiagnosticLevel(PDAt.second.getDiagID(), PDAt.first) >=
-           DiagnosticsEngine::Error;
-  });
-  if (HasError) {
-    DeferredDiags.insert(DeferredDiags.end(),
-                         std::make_move_iterator(Diags.begin()),
-                         std::make_move_iterator(Diags.end()));
-    return;
-  }
-  for (PartialDiagnosticAt &PDAt : Diags) {
-    const SourceLocation &Loc = PDAt.first;
-    const PartialDiagnostic &PD = PDAt.second;
-    DiagnosticBuilder Builder(getDiags().Report(Loc, PD.getDiagID()));
-    PD.Emit(Builder);
-  }
-
   // Compute the function info and LLVM type.
   const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
   llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
Index: cfe/trunk/lib/Sema/SemaCUDA.cpp
===================================================================
--- cfe/trunk/lib/Sema/SemaCUDA.cpp
+++ cfe/trunk/lib/Sema/SemaCUDA.cpp
@@ -499,27 +499,84 @@
     break;
   case K_Deferred:
     assert(Fn && "Must have a function to attach the deferred diag to.");
-    PartialDiagInfo.emplace(Loc, S.PDiag(DiagID), Fn);
+    PartialDiagInfo.emplace(S, Loc, S.PDiag(DiagID), Fn);
     break;
   }
 }
 
+// In CUDA, there are some constructs which may appear in semantically-valid
+// code, but trigger errors if we ever generate code for the function in which
+// they appear. Essentially every construct you're not allowed to use on the
+// device falls into this category, because you are allowed to use these
+// constructs in a __host__ __device__ function, but only if that function is
+// never codegen'ed on the device.
+//
+// To handle semantic checking for these constructs, we keep track of the set of
+// functions we know will be emitted, either because we could tell a priori that
+// they would be emitted, or because they were transitively called by a
+// known-emitted function.
+//
+// We also keep a partial call graph of which not-known-emitted functions call
+// which other not-known-emitted functions.
+//
+// When we see something which is illegal if the current function is emitted
+// (usually by way of CUDADiagIfDeviceCode, CUDADiagIfHostCode, or
+// CheckCUDACall), we first check if the current function is known-emitted. If
+// so, we immediately output the diagnostic.
+//
+// Otherwise, we "defer" the diagnostic. It sits in Sema::CUDADeferredDiags
+// until we discover that the function is known-emitted, at which point we take
+// it out of this map and emit the diagnostic.
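Concretely, the situation the comment describes looks like this (illustrative CUDA, not one of the patch's tests):

    void host_fn();                               // host-only
    __host__ __device__ void hd() { host_fn(); }  // wrong-side call: an error only
                                                  // if hd() is emitted for the
                                                  // device, so the diag is deferred.
    __global__ void kernel() { hd(); }            // kernel() is known-emitted a priori
                                                  // (strong linkage), which makes hd()
                                                  // known-emitted and fires the
                                                  // deferred diagnostic.

Remove kernel() and the file compiles cleanly in device mode, since hd() is then never codegen'ed for the device.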
+
+// Do we know that we will eventually codegen the given function?
+static bool IsKnownEmitted(Sema &S, FunctionDecl *FD) {
+  // Templates are emitted when they're instantiated.
+  if (FD->isDependentContext())
+    return false;
+
+  // When compiling for device, host functions are never emitted. Similarly,
+  // when compiling for host, device and global functions are never emitted.
+  // (Technically, we do emit a host-side stub for global functions, but this
+  // doesn't count for our purposes here.)
+  Sema::CUDAFunctionTarget T = S.IdentifyCUDATarget(FD);
+  if (S.getLangOpts().CUDAIsDevice && T == Sema::CFT_Host)
+    return false;
+  if (!S.getLangOpts().CUDAIsDevice &&
+      (T == Sema::CFT_Device || T == Sema::CFT_Global))
+    return false;
+
+  // Externally-visible and similar functions are always emitted.
+  if (S.getASTContext().GetGVALinkageForFunction(FD) > GVA_DiscardableODR)
+    return true;
+
+  // Otherwise, the function is known-emitted if it's in our set of
+  // known-emitted functions.
+  return S.CUDAKnownEmittedFns.count(FD) > 0;
+}
+
 Sema::CUDADiagBuilder Sema::CUDADiagIfDeviceCode(SourceLocation Loc,
                                                  unsigned DiagID) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
-  CUDADiagBuilder::Kind DiagKind;
-  switch (CurrentCUDATarget()) {
-  case CFT_Global:
-  case CFT_Device:
-    DiagKind = CUDADiagBuilder::K_Immediate;
-    break;
-  case CFT_HostDevice:
-    DiagKind = getLangOpts().CUDAIsDevice ? CUDADiagBuilder::K_Deferred
-                                          : CUDADiagBuilder::K_Nop;
-    break;
-  default:
-    DiagKind = CUDADiagBuilder::K_Nop;
-  }
+  CUDADiagBuilder::Kind DiagKind = [&] {
+    switch (CurrentCUDATarget()) {
+    case CFT_Global:
+    case CFT_Device:
+      return CUDADiagBuilder::K_Immediate;
+    case CFT_HostDevice:
+      // An HD function counts as host code if we're compiling for host, and
+      // device code if we're compiling for device. Defer any errors in device
+      // mode until the function is known-emitted.
+      if (getLangOpts().CUDAIsDevice) {
+        return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
+                   ? CUDADiagBuilder::K_Immediate
+                   : CUDADiagBuilder::K_Deferred;
+      }
+      return CUDADiagBuilder::K_Nop;
+
+    default:
+      return CUDADiagBuilder::K_Nop;
+    }
+  }();
   return CUDADiagBuilder(DiagKind, Loc, DiagID,
                         dyn_cast<FunctionDecl>(CurContext), *this);
 }
@@ -527,41 +584,119 @@
 Sema::CUDADiagBuilder Sema::CUDADiagIfHostCode(SourceLocation Loc,
                                                unsigned DiagID) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
-  CUDADiagBuilder::Kind DiagKind;
-  switch (CurrentCUDATarget()) {
-  case CFT_Host:
-    DiagKind = CUDADiagBuilder::K_Immediate;
-    break;
-  case CFT_HostDevice:
-    DiagKind = getLangOpts().CUDAIsDevice ? CUDADiagBuilder::K_Nop
-                                          : CUDADiagBuilder::K_Deferred;
-    break;
-  default:
-    DiagKind = CUDADiagBuilder::K_Nop;
-  }
+  CUDADiagBuilder::Kind DiagKind = [&] {
+    switch (CurrentCUDATarget()) {
+    case CFT_Host:
+      return CUDADiagBuilder::K_Immediate;
+    case CFT_HostDevice:
+      // An HD function counts as host code if we're compiling for host, and
+      // device code if we're compiling for device. Defer any errors in host
+      // mode until the function is known-emitted.
+      if (getLangOpts().CUDAIsDevice)
+        return CUDADiagBuilder::K_Nop;
+
+      return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
+                 ? CUDADiagBuilder::K_Immediate
+                 : CUDADiagBuilder::K_Deferred;
+    default:
+      return CUDADiagBuilder::K_Nop;
+    }
+  }();
  return CUDADiagBuilder(DiagKind, Loc, DiagID,
                         dyn_cast<FunctionDecl>(CurContext), *this);
 }
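For reference, the "Example usage" elided by the Sema.h hunk context above follows roughly this pattern (a paraphrase, not the verbatim comment; err_cuda_vla is a real diagnostic ID, the surrounding control flow is illustrative):

    // Variable-length arrays are not allowed in CUDA device code.
    if (CUDADiagIfDeviceCode(Loc, diag::err_cuda_vla) << CurrentCUDATarget())
      return ExprError();

With the kinds computed above, the same call site yields K_Immediate when the surrounding HD function is already known-emitted and K_Deferred otherwise, with no change needed at callers.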
+// Emit any deferred diagnostics for FD and erase them from the map in which
+// they're stored.
+static void EmitDeferredDiags(Sema &S, FunctionDecl *FD) {
+  auto It = S.CUDADeferredDiags.find(FD);
+  if (It == S.CUDADeferredDiags.end())
+    return;
+  for (PartialDiagnosticAt &PDAt : It->second) {
+    const SourceLocation &Loc = PDAt.first;
+    const PartialDiagnostic &PD = PDAt.second;
+    DiagnosticBuilder Builder(S.Diags.Report(Loc, PD.getDiagID()));
+    Builder.setForceEmit();
+    PD.Emit(Builder);
+  }
+  S.CUDADeferredDiags.erase(It);
+}
+
+// Indicate that this function (and thus everything it transitively calls) will
+// be codegen'ed, and emit any deferred diagnostics on this function and its
+// (transitive) callees.
+static void MarkKnownEmitted(Sema &S, FunctionDecl *FD) {
+  // Nothing to do if we already know that FD is emitted.
+  if (IsKnownEmitted(S, FD)) {
+    assert(!S.CUDACallGraph.count(FD));
+    return;
+  }
+
+  // We've just discovered that FD is known-emitted. Walk our call graph to see
+  // what else we can now discover also must be emitted.
+  llvm::SmallVector<FunctionDecl *, 4> Worklist = {FD};
+  llvm::SmallSet<FunctionDecl *, 4> Seen;
+  Seen.insert(FD);
+  while (!Worklist.empty()) {
+    FunctionDecl *Caller = Worklist.pop_back_val();
+    assert(!IsKnownEmitted(S, Caller) &&
+           "Worklist should not contain known-emitted functions.");
+    S.CUDAKnownEmittedFns.insert(Caller);
+    EmitDeferredDiags(S, Caller);
+
+    // Deferred diags are often emitted on the template itself, so emit those as
+    // well.
+    if (auto *Templ = Caller->getPrimaryTemplate())
+      EmitDeferredDiags(S, Templ->getAsFunction());
+
+    // Add all functions called by Caller to our worklist.
+    auto CGIt = S.CUDACallGraph.find(Caller);
+    if (CGIt == S.CUDACallGraph.end())
+      continue;
+
+    for (FunctionDecl *Callee : CGIt->second) {
+      if (Seen.count(Callee) || IsKnownEmitted(S, Callee))
+        continue;
+      Seen.insert(Callee);
+      Worklist.push_back(Callee);
+    }
+
+    // Caller is now known-emitted, so we no longer need to maintain its list of
+    // callees in CUDACallGraph.
+    S.CUDACallGraph.erase(CGIt);
+  }
+}
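The getPrimaryTemplate() handling above matters for cases like this (illustrative, not one of the patch's tests):

    void host_fn();
    template <typename T>
    __host__ __device__ void hd_tmpl() { host_fn(); }  // the deferred diag may be
                                                       // keyed on the template
                                                       // pattern itself
    __global__ void kernel() { hd_tmpl<int>(); }       // marking hd_tmpl<int>
                                                       // known-emitted also drains
                                                       // diags attached to the
                                                       // primary template

Without the extra EmitDeferredDiags call on the primary template, such diagnostics would sit in CUDADeferredDiags forever.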
 bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
   assert(Callee && "Callee may not be null.");
+  // FIXME: Is bailing out early correct here? Should we instead assume that
+  // the caller is a global initializer?
   FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext);
   if (!Caller)
     return true;
 
-  CUDADiagBuilder::Kind DiagKind;
-  switch (IdentifyCUDAPreference(Caller, Callee)) {
-  case CFP_Never:
-    DiagKind = CUDADiagBuilder::K_Immediate;
-    break;
-  case CFP_WrongSide:
-    assert(Caller && "WrongSide calls require a non-null caller");
-    DiagKind = CUDADiagBuilder::K_Deferred;
-    break;
-  default:
-    DiagKind = CUDADiagBuilder::K_Nop;
-  }
+  bool CallerKnownEmitted = IsKnownEmitted(*this, Caller);
+  if (CallerKnownEmitted)
+    MarkKnownEmitted(*this, Callee);
+  else
+    CUDACallGraph[Caller].insert(Callee);
+
+  CUDADiagBuilder::Kind DiagKind = [&] {
+    switch (IdentifyCUDAPreference(Caller, Callee)) {
+    case CFP_Never:
+      return CUDADiagBuilder::K_Immediate;
+    case CFP_WrongSide:
+      assert(Caller && "WrongSide calls require a non-null caller");
+      // If we know the caller will be emitted, we know this wrong-side call
+      // will be emitted, so it's an immediate error. Otherwise, defer the
+      // error until we know the caller is emitted.
+      return CallerKnownEmitted ? CUDADiagBuilder::K_Immediate
+                                : CUDADiagBuilder::K_Deferred;
+    default:
+      return CUDADiagBuilder::K_Nop;
+    }
+  }();
 
   // Avoid emitting this error twice for the same location. Using a hashtable
   // like this is unfortunate, but because we must continue parsing as normal
Index: cfe/trunk/test/Parser/lambda-attr.cu
===================================================================
--- cfe/trunk/test/Parser/lambda-attr.cu
+++ cfe/trunk/test/Parser/lambda-attr.cu
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -fcuda-is-device -verify %s
 
 __attribute__((device)) void device_fn() {}
-__attribute__((device)) void hd_fn() {}
+__attribute__((host, device)) void hd_fn() {}
 
 __attribute__((device)) void device_attr() {
   ([]() __attribute__((device)) { device_fn(); })();
Index: cfe/trunk/test/SemaCUDA/call-host-fn-from-device.cu
===================================================================
--- cfe/trunk/test/SemaCUDA/call-host-fn-from-device.cu
+++ cfe/trunk/test/SemaCUDA/call-host-fn-from-device.cu
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 %s --std=c++11 -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - -verify
+// RUN: %clang_cc1 %s --std=c++11 -triple nvptx-unknown-unknown -fcuda-is-device \
+// RUN:   -emit-llvm -o /dev/null -verify
 
 // Note: This test won't work with -fsyntax-only, because some of these errors
 // are emitted during codegen.
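The test updates that follow lean on -verify's relative-offset directives: expected-error@-N anchors the expectation N lines above the comment itself, which lets the directives live inside #if __CUDA_ARCH__ blocks without disturbing the code under test. A small illustration (not from the patch; the diagnostic text matches the wording used throughout these tests):

    __host__ void host_only();
    __host__ __device__ void hd() {
      host_only();
    #if defined(__CUDA_ARCH__)
      // expected-error@-2 {{reference to __host__ function 'host_only' in __host__ __device__ function}}
    #endif
    }

The __CUDA_ARCH__ guards matter because these files are checked for both sides of the compilation: host-side expectations apply when __CUDA_ARCH__ is undefined, device-side ones when it is defined.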
Index: cfe/trunk/test/SemaCUDA/function-overload.cu
===================================================================
--- cfe/trunk/test/SemaCUDA/function-overload.cu
+++ cfe/trunk/test/SemaCUDA/function-overload.cu
@@ -170,18 +170,35 @@
   DeviceReturnTy ret_d = d();
   DeviceFnPtr fp_cd = cd;
   DeviceReturnTy ret_cd = cd();
+#if !defined(__CUDA_ARCH__)
+  // expected-error@-5 {{reference to __device__ function 'd' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __device__ function 'd' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __device__ function 'cd' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __device__ function 'cd' in __host__ __device__ function}}
+#endif
 
   HostFnPtr fp_h = h;
   HostReturnTy ret_h = h();
   HostFnPtr fp_ch = ch;
   HostReturnTy ret_ch = ch();
+#if defined(__CUDA_ARCH__)
+  // expected-error@-5 {{reference to __host__ function 'h' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __host__ function 'h' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
+#endif
 
   CurrentFnPtr fp_dh = dh;
   CurrentReturnTy ret_dh = dh();
   CurrentFnPtr fp_cdh = cdh;
   CurrentReturnTy ret_cdh = cdh();
 
-  g(); // expected-error {{call to global function g not configured}}
+  g();
+#if defined (__CUDA_ARCH__)
+  // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
+#else
+  // expected-error@-4 {{call to global function g not configured}}
+#endif
 }
 
 // Test for address of overloaded function resolution in the global context.
@@ -297,7 +314,11 @@
 // If we have a mix of HD and H-only or D-only candidates in the overload set,
 // normal C++ overload resolution rules apply first.
-template <typename T> TemplateReturnTy template_vs_hd_function(T arg) {
+template <typename T> TemplateReturnTy template_vs_hd_function(T arg)
+#ifdef __CUDA_ARCH__
+//expected-note@-2 {{declared here}}
+#endif
+{
   return TemplateReturnTy();
 }
 __host__ __device__ HostDeviceReturnTy template_vs_hd_function(float arg) {
@@ -307,6 +328,9 @@
 __host__ __device__ void test_host_device_calls_hd_template() {
   HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
   TemplateReturnTy ret2 = template_vs_hd_function(1);
+#ifdef __CUDA_ARCH__
+  // expected-error@-2 {{reference to __host__ function 'template_vs_hd_function<int>' in __host__ __device__ function}}
+#endif
 }
 
 __host__ void test_host_calls_hd_template() {
@@ -326,14 +350,30 @@
 // side of compilation.
 __device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); }
 __device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); }
+#ifndef __CUDA_ARCH__
+  // expected-note@-3 {{'device_only_function' declared here}}
+  // expected-note@-3 {{'device_only_function' declared here}}
+#endif
 __host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); }
 __host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); }
+#ifdef __CUDA_ARCH__
+  // expected-note@-3 {{'host_only_function' declared here}}
+  // expected-note@-3 {{'host_only_function' declared here}}
+#endif
 
 __host__ __device__ void test_host_device_single_side_overloading() {
   DeviceReturnTy ret1 = device_only_function(1);
   DeviceReturnTy2 ret2 = device_only_function(1.0f);
+#ifndef __CUDA_ARCH__
+  // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+  // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+#endif
   HostReturnTy ret3 = host_only_function(1);
   HostReturnTy2 ret4 = host_only_function(1.0f);
+#ifdef __CUDA_ARCH__
+  // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+  // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+#endif
 }
 
 // Verify that we allow overloading function templates.
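Each of these files carries its own RUN lines, so after rebuilding clang an individual test can be re-checked with llvm-lit from the build directory, along the lines of the following (the paths are illustrative and depend on your checkout and build layout):

    ./bin/llvm-lit -v <path-to-clang-src>/test/SemaCUDA/function-overload.cu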
Index: cfe/trunk/test/SemaCUDA/method-target.cu
===================================================================
--- cfe/trunk/test/SemaCUDA/method-target.cu
+++ cfe/trunk/test/SemaCUDA/method-target.cu
@@ -29,7 +29,7 @@
 // Test 3: device method called from host function
 
 struct S3 {
-  __device__ void method() {} // expected-note {{'method' declared here}};
+  __device__ void method() {} // expected-note {{'method' declared here}}
 };
 
 void foo3(S3& s) {
@@ -40,11 +40,11 @@
 // Test 4: device method called from host&device function
 
 struct S4 {
-  __device__ void method() {}
+  __device__ void method() {} // expected-note {{'method' declared here}}
 };
 
 __host__ __device__ void foo4(S4& s) {
-  s.method();
+  s.method(); // expected-error {{reference to __device__ function 'method' in __host__ __device__ function}}
 }
 
 //------------------------------------------------------------------------------
Index: cfe/trunk/test/SemaCUDA/reference-to-kernel-fn.cu
===================================================================
--- cfe/trunk/test/SemaCUDA/reference-to-kernel-fn.cu
+++ cfe/trunk/test/SemaCUDA/reference-to-kernel-fn.cu
@@ -18,10 +18,7 @@
 __host__ __device__ fn_ptr_t get_ptr_hd() {
   return kernel;
 #ifdef DEVICE
-  // This emits a deferred error on the device, but we don't catch it in this
-  // file because the non-deferred error below precludes this.
-
-  // FIXME-expected-error@-2 {{reference to __global__ function}}
+  // expected-error@-2 {{reference to __global__ function}}
 #endif
 }
 __host__ fn_ptr_t get_ptr_h() {