Index: cfe/trunk/include/clang/Sema/Sema.h
===================================================================
--- cfe/trunk/include/clang/Sema/Sema.h
+++ cfe/trunk/include/clang/Sema/Sema.h
@@ -9252,10 +9252,10 @@
   llvm::DenseMap<const FunctionDecl *, std::vector<PartialDiagnosticAt>>
       CUDADeferredDiags;
 
-  /// Raw encodings of SourceLocations for which CheckCUDACall has emitted a
-  /// (maybe deferred) "bad call" diagnostic.  We use this to avoid emitting the
-  /// same deferred diag twice.
-  llvm::DenseSet<unsigned> LocsWithCUDACallDiags;
+  /// (Caller, raw SourceLocation encoding) pairs for which CheckCUDACall has
+  /// emitted a (maybe deferred) "bad call" diagnostic.  Used to avoid emitting
+  /// the same deferred diag twice for one caller at one location.
+  llvm::DenseSet<std::pair<FunctionDecl *, unsigned>> LocsWithCUDACallDiags;
 
   /// The set of CUDA functions that we've discovered must be emitted by tracing
   /// the call graph.  Functions that we can tell a priori must be emitted
Index: cfe/trunk/lib/Sema/SemaCUDA.cpp
===================================================================
--- cfe/trunk/lib/Sema/SemaCUDA.cpp
+++ cfe/trunk/lib/Sema/SemaCUDA.cpp
@@ -714,20 +714,22 @@
     }
   }();
 
+  if (DiagKind == CUDADiagBuilder::K_Nop)
+    return true;
+
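-  // Avoid emitting this error twice for the same location.  Using a hashtable
-  // like this is unfortunate, but because we must continue parsing as normal
-  // after encountering a deferred error, it's otherwise very tricky for us to
-  // ensure that we only emit this deferred error once.
+  // Avoid emitting this error twice for the same caller and location.  Using a
+  // hashtable like this is unfortunate, but because we must continue parsing
+  // as normal after encountering a deferred error, it's otherwise very tricky
+  // for us to ensure that we only emit this deferred error once.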
-  if (!LocsWithCUDACallDiags.insert(Loc.getRawEncoding()).second)
+  if (!LocsWithCUDACallDiags.insert({Caller, Loc.getRawEncoding()}).second)
     return true;
 
-  bool IsImmediateErr =
-      CUDADiagBuilder(DiagKind, Loc, diag::err_ref_bad_target, Caller, *this)
+  CUDADiagBuilder(DiagKind, Loc, diag::err_ref_bad_target, Caller, *this)
       << IdentifyCUDATarget(Callee) << Callee << IdentifyCUDATarget(Caller);
   CUDADiagBuilder(DiagKind, Callee->getLocation(), diag::note_previous_decl,
                   Caller, *this)
       << Callee;
-  return !IsImmediateErr;
+  return DiagKind != CUDADiagBuilder::K_Immediate;
 }
 
 void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) {
Index: cfe/trunk/test/SemaCUDA/bad-calls-on-same-line.cu
===================================================================
--- cfe/trunk/test/SemaCUDA/bad-calls-on-same-line.cu
+++ cfe/trunk/test/SemaCUDA/bad-calls-on-same-line.cu
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// The hd function template is instantiated three times.
+//
+// Two of those instantiations call a device function, which is an error when
+// compiling for host.  Clang should report both errors.
+
+#include "Inputs/cuda.h"
+
+template <typename T>
+struct Selector {};
+
+template <>
+struct Selector<int> {
+  __host__ void f() {}
+};
+
+template <>
+struct Selector<float> {
+  __device__ void f() {} // expected-note {{declared here}}
+};
+
+template <>
+struct Selector<double> {
+  __device__ void f() {} // expected-note {{declared here}}
+};
+
+template <typename T>
+inline __host__ __device__ void hd() {
+  Selector<T>().f();
+  // expected-error@-1 {{reference to __device__ function}}
+  // expected-error@-2 {{reference to __device__ function}}
+}
+
+void host_fn() {
+  hd<int>();
+  hd<double>();  // expected-note {{function template specialization 'hd<double>'}}
+  hd<float>();  // expected-note {{function template specialization 'hd<float>'}}
+}