Index: cfe/trunk/include/clang/Basic/DiagnosticSemaKinds.td
===================================================================
--- cfe/trunk/include/clang/Basic/DiagnosticSemaKinds.td
+++ cfe/trunk/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6734,7 +6734,7 @@
   "conflicting __device__ function declared here">;
 def err_cuda_device_exceptions : Error<
   "cannot use '%0' in "
-  "%select{__device__|__global__|__host__|__host__ __device__}1 function %2">;
+  "%select{__device__|__global__|__host__|__host__ __device__}1 function">;
 def err_dynamic_var_init : Error<
     "dynamic initialization is not supported for "
     "__device__, __constant__, and __shared__ variables.">;
Index: cfe/trunk/include/clang/Sema/Sema.h
===================================================================
--- cfe/trunk/include/clang/Sema/Sema.h
+++ cfe/trunk/include/clang/Sema/Sema.h
@@ -9245,6 +9245,100 @@
   /// before incrementing, so you can emit an error.
   bool PopForceCUDAHostDevice();
 
+  /// Diagnostic builder for CUDA errors which may or may not be deferred.
+  ///
+  /// In CUDA, there exist constructs (e.g. variable-length arrays, try/catch)
+  /// which are not allowed to appear inside __device__ functions and are
+  /// allowed to appear in __host__ __device__ functions only if the host+device
+  /// function is never codegen'ed.
+  ///
+  /// To handle this, we use the notion of "deferred diagnostics", where we
+  /// attach a diagnostic to a FunctionDecl that's emitted iff it's codegen'ed.
+  ///
+  /// This class lets you emit either a regular diagnostic, a deferred
+  /// diagnostic, or no diagnostic at all, according to an argument you pass to
+  /// its constructor, thus simplifying the process of creating these "maybe
+  /// deferred" diagnostics.
+  class CUDADiagBuilder {
+  public:
+    enum Kind {
+      /// Emit no diagnostics.
+      K_Nop,
+      /// Emit the diagnostic immediately (i.e., behave like Sema::Diag()).
+      K_Immediate,
+      /// Create a deferred diagnostic, which is emitted only if the function
+      /// it's attached to is codegen'ed.
+      K_Deferred
+    };
+
+    CUDADiagBuilder(Kind K, SourceLocation Loc, unsigned DiagID,
+                    FunctionDecl *Fn, Sema &S);
+
+    /// Convertible to bool: True if we immediately emitted an error, false if
+    /// we didn't emit an error or we created a deferred error.
+    ///
+    /// Example usage:
+    ///
+    ///   if (CUDADiagBuilder(...) << foo << bar)
+    ///     return ExprError();
+    ///
+    /// But see CUDADiagIfDeviceCode() and CUDADiagIfHostCode() -- you probably
+    /// want to use these instead of creating a CUDADiagBuilder yourself.
+    operator bool() const { return ImmediateDiagBuilder.hasValue(); }
+
+    template <typename T>
+    friend const CUDADiagBuilder &operator<<(const CUDADiagBuilder &Diag,
+                                             const T &Value) {
+      if (Diag.ImmediateDiagBuilder.hasValue())
+        *Diag.ImmediateDiagBuilder << Value;
+      else if (Diag.PartialDiagInfo.hasValue())
+        Diag.PartialDiagInfo->PD << Value;
+      return Diag;
+    }
+
+  private:
+    struct PartialDiagnosticInfo {
+      PartialDiagnosticInfo(SourceLocation Loc, PartialDiagnostic PD,
+                            FunctionDecl *Fn)
+          : Loc(Loc), PD(std::move(PD)), Fn(Fn) {}
+
+      ~PartialDiagnosticInfo() { Fn->addDeferredDiag({Loc, std::move(PD)}); }
+
+      SourceLocation Loc;
+      PartialDiagnostic PD;
+      FunctionDecl *Fn;
+    };
+
+    // Invariant: At most one of these Optionals has a value.
+    // FIXME: Switch these to a Variant once that exists.
+    llvm::Optional<Sema::SemaDiagnosticBuilder> ImmediateDiagBuilder;
+    llvm::Optional<PartialDiagnosticInfo> PartialDiagInfo;
+  };
+
+  /// Creates a CUDADiagBuilder that emits the diagnostic if the current context
+  /// is "used as device code".
+  ///
+  /// - If CurContext is a __host__ function, does not emit any diagnostics.
+  /// - If CurContext is a __device__ or __global__ function, emits the
+  ///   diagnostics immediately.
+  /// - If CurContext is a __host__ __device__ function and we are compiling for
+  ///   the device, creates a deferred diagnostic which is emitted if and when
+  ///   the function is codegen'ed.
+  ///
+  /// Example usage:
+  ///
+  ///  // Variable-length arrays are not allowed in CUDA device code.
+  ///  if (CUDADiagIfDeviceCode(Loc, diag::err_cuda_vla) << CurrentCUDATarget())
+  ///    return ExprError();
+  ///  // Otherwise, continue parsing as normal.
+  CUDADiagBuilder CUDADiagIfDeviceCode(SourceLocation Loc, unsigned DiagID);
+
+  /// Creates a CUDADiagBuilder that emits the diagnostic if the current context
+  /// is "used as host code".
+  ///
+  /// Same as CUDADiagIfDeviceCode, with "host" and "device" switched.
+  CUDADiagBuilder CUDADiagIfHostCode(SourceLocation Loc, unsigned DiagID);
+
   enum CUDAFunctionTarget {
     CFT_Device,
     CFT_Global,
@@ -9253,8 +9347,18 @@
     CFT_InvalidTarget
   };
 
+  /// Determines whether the given function is a CUDA device/host/kernel/etc.
+  /// function.
+  ///
+  /// Use this rather than examining the function's attributes yourself -- you
+  /// will get it wrong.  Returns CFT_Host if D is null.
   CUDAFunctionTarget IdentifyCUDATarget(const FunctionDecl *D);
 
+  /// Gets the CUDA target for the current context.
+  CUDAFunctionTarget CurrentCUDATarget() {
+    return IdentifyCUDATarget(dyn_cast<FunctionDecl>(CurContext));
+  }
+
   // CUDA function call preference. Must be ordered numerically from
   // worst to best.
   enum CUDAFunctionPreference {
@@ -9295,9 +9399,9 @@
 
 private:
   /// Raw encodings of SourceLocations for which CheckCUDACall has emitted a
-  /// deferred "bad call" diagnostic.  We use this to avoid emitting the same
-  /// deferred diag twice.
-  llvm::DenseSet<unsigned> LocsWithCUDACallDeferredDiags;
+  /// (maybe deferred) "bad call" diagnostic.  We use this to avoid emitting the
+  /// same deferred diag twice.
+  llvm::DenseSet<unsigned> LocsWithCUDACallDiags;
 
 public:
   /// Check whether we're allowed to call Callee from the current context.
@@ -9316,21 +9420,6 @@
   /// - Otherwise, returns true without emitting any diagnostics.
   bool CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee);
 
-  /// Check whether a 'try' or 'throw' expression is allowed within the current
-  /// context, and raise an error or create a deferred error, as appropriate.
-  ///
-  /// 'try' and 'throw' are never allowed in CUDA __device__ functions, and are
-  /// allowed in __host__ __device__ functions only if those functions are never
-  /// codegen'ed for the device.
-  ///
-  /// ExprTy should be the string "try" or "throw", as appropriate.
-  bool CheckCUDAExceptionExpr(SourceLocation Loc, StringRef ExprTy);
-
-  /// Check whether it's legal for us to create a variable-length array in the
-  /// current context.  Returns true if the VLA is OK; returns false and emits
-  /// an error otherwise.
-  bool CheckCUDAVLA(SourceLocation Loc);
-
   /// Set __device__ or __host__ __device__ attributes on the given lambda
   /// operator() method.
   ///
Index: cfe/trunk/lib/Sema/SemaCUDA.cpp
===================================================================
--- cfe/trunk/lib/Sema/SemaCUDA.cpp
+++ cfe/trunk/lib/Sema/SemaCUDA.cpp
@@ -18,6 +18,7 @@
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/SemaDiagnostic.h"
+#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/Template.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
@@ -55,6 +56,10 @@
 
 /// IdentifyCUDATarget - Determine the CUDA compilation target for this function
 Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D) {
+  // Code that lives outside a function is run on the host.
+  if (D == nullptr)
+    return CFT_Host;
+
   if (D->hasAttr<CUDAInvalidTargetAttr>())
     return CFT_InvalidTarget;
 
@@ -108,9 +113,8 @@
 Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
                              const FunctionDecl *Callee) {
   assert(Callee && "Callee must be valid.");
+  CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(Caller);
   CUDAFunctionTarget CalleeTarget = IdentifyCUDATarget(Callee);
-  CUDAFunctionTarget CallerTarget =
-      (Caller != nullptr) ? IdentifyCUDATarget(Caller) : Sema::CFT_Host;
 
   // If one of the targets is invalid, the check always fails, no matter what
   // the other target is.
@@ -484,88 +488,95 @@
   NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context));
 }
 
-bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
-  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
-  assert(Callee && "Callee may not be null.");
-  FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext);
-  if (!Caller)
-    return true;
-
-  Sema::CUDAFunctionPreference Pref = IdentifyCUDAPreference(Caller, Callee);
-  if (Pref == Sema::CFP_Never) {
-    Diag(Loc, diag::err_ref_bad_target) << IdentifyCUDATarget(Callee) << Callee
-                                        << IdentifyCUDATarget(Caller);
-    Diag(Callee->getLocation(), diag::note_previous_decl) << Callee;
-    return false;
+Sema::CUDADiagBuilder::CUDADiagBuilder(Kind K, SourceLocation Loc,
+                                       unsigned DiagID, FunctionDecl *Fn,
+                                       Sema &S) {
+  switch (K) {
+  case K_Nop:
+    break;
+  case K_Immediate:
+    ImmediateDiagBuilder.emplace(S.Diag(Loc, DiagID));
+    break;
+  case K_Deferred:
+    assert(Fn && "Must have a function to attach the deferred diag to.");
+    PartialDiagInfo.emplace(Loc, S.PDiag(DiagID), Fn);
+    break;
   }
+}
 
-  // Insert into LocsWithCUDADeferredDiags to avoid emitting duplicate deferred
-  // diagnostics for the same location.  Duplicate deferred diags are otherwise
-  // tricky to avoid, because, unlike with regular errors, sema checking
-  // proceeds unhindered when we omit a deferred diagnostic.
-  if (Pref == Sema::CFP_WrongSide &&
-      LocsWithCUDACallDeferredDiags.insert(Loc.getRawEncoding()).second) {
-    // We have to do this odd dance to create our PartialDiagnostic because we
-    // want its storage to be allocated with operator new, not in an arena.
-    PartialDiagnostic ErrPD{PartialDiagnostic::NullDiagnostic()};
-    ErrPD.Reset(diag::err_ref_bad_target);
-    ErrPD << IdentifyCUDATarget(Callee) << Callee << IdentifyCUDATarget(Caller);
-    Caller->addDeferredDiag({Loc, std::move(ErrPD)});
-
-    PartialDiagnostic NotePD{PartialDiagnostic::NullDiagnostic()};
-    NotePD.Reset(diag::note_previous_decl);
-    NotePD << Callee;
-    Caller->addDeferredDiag({Callee->getLocation(), std::move(NotePD)});
-
-    // This is not immediately an error, so return true.  The deferred errors
-    // will be emitted if and when Caller is codegen'ed.
-    return true;
+Sema::CUDADiagBuilder Sema::CUDADiagIfDeviceCode(SourceLocation Loc,
+                                                 unsigned DiagID) {
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+  CUDADiagBuilder::Kind DiagKind;
+  switch (CurrentCUDATarget()) {
+  case CFT_Global:
+  case CFT_Device:
+    DiagKind = CUDADiagBuilder::K_Immediate;
+    break;
+  case CFT_HostDevice:
+    DiagKind = getLangOpts().CUDAIsDevice ? CUDADiagBuilder::K_Deferred
+                                          : CUDADiagBuilder::K_Nop;
+    break;
+  default:
+    DiagKind = CUDADiagBuilder::K_Nop;
   }
-  return true;
+  return CUDADiagBuilder(DiagKind, Loc, DiagID,
+                         dyn_cast<FunctionDecl>(CurContext), *this);
 }
 
-bool Sema::CheckCUDAExceptionExpr(SourceLocation Loc, StringRef ExprTy) {
+Sema::CUDADiagBuilder Sema::CUDADiagIfHostCode(SourceLocation Loc,
+                                               unsigned DiagID) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
-  FunctionDecl *CurFn = dyn_cast<FunctionDecl>(CurContext);
-  if (!CurFn)
-    return true;
-  CUDAFunctionTarget Target = IdentifyCUDATarget(CurFn);
-
-  // Raise an error immediately if this is a __global__ or __device__ function.
-  // If it's a __host__ __device__ function, enqueue a deferred error which will
-  // be emitted if the function is codegen'ed for device.
-  if (Target == CFT_Global || Target == CFT_Device) {
-    Diag(Loc, diag::err_cuda_device_exceptions) << ExprTy << Target << CurFn;
-    return false;
-  }
-  if (Target == CFT_HostDevice && getLangOpts().CUDAIsDevice) {
-    PartialDiagnostic ErrPD{PartialDiagnostic::NullDiagnostic()};
-    ErrPD.Reset(diag::err_cuda_device_exceptions);
-    ErrPD << ExprTy << Target << CurFn;
-    CurFn->addDeferredDiag({Loc, std::move(ErrPD)});
-    return false;
+  CUDADiagBuilder::Kind DiagKind;
+  switch (CurrentCUDATarget()) {
+  case CFT_Host:
+    DiagKind = CUDADiagBuilder::K_Immediate;
+    break;
+  case CFT_HostDevice:
+    DiagKind = getLangOpts().CUDAIsDevice ? CUDADiagBuilder::K_Nop
+                                          : CUDADiagBuilder::K_Deferred;
+    break;
+  default:
+    DiagKind = CUDADiagBuilder::K_Nop;
   }
-  return true;
+  return CUDADiagBuilder(DiagKind, Loc, DiagID,
+                         dyn_cast<FunctionDecl>(CurContext), *this);
 }
 
-bool Sema::CheckCUDAVLA(SourceLocation Loc) {
+bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
-  FunctionDecl *CurFn = dyn_cast<FunctionDecl>(CurContext);
-  if (!CurFn)
+  assert(Callee && "Callee may not be null.");
+  FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext);
+  if (!Caller)
     return true;
-  CUDAFunctionTarget Target = IdentifyCUDATarget(CurFn);
-  if (Target == CFT_Global || Target == CFT_Device) {
-    Diag(Loc, diag::err_cuda_vla) << Target;
-    return false;
-  }
-  if (Target == CFT_HostDevice && getLangOpts().CUDAIsDevice) {
-    PartialDiagnostic ErrPD{PartialDiagnostic::NullDiagnostic()};
-    ErrPD.Reset(diag::err_cuda_vla);
-    ErrPD << Target;
-    CurFn->addDeferredDiag({Loc, std::move(ErrPD)});
-    return false;
-  }
-  return true;
+
+  CUDADiagBuilder::Kind DiagKind;
+  switch (IdentifyCUDAPreference(Caller, Callee)) {
+  case CFP_Never:
+    DiagKind = CUDADiagBuilder::K_Immediate;
+    break;
+  case CFP_WrongSide:
+    assert(Caller && "WrongSide calls require a non-null caller");
+    DiagKind = CUDADiagBuilder::K_Deferred;
+    break;
+  default:
+    DiagKind = CUDADiagBuilder::K_Nop;
+  }
+
+  // Avoid emitting this error twice for the same location.  Using a hashtable
+  // like this is unfortunate, but because we must continue parsing as normal
+  // after encountering a deferred error, it's otherwise very tricky for us to
+  // ensure that we only emit this deferred error once.
+  if (!LocsWithCUDACallDiags.insert(Loc.getRawEncoding()).second)
+    return true;
+
+  bool IsImmediateErr =
+      CUDADiagBuilder(DiagKind, Loc, diag::err_ref_bad_target, Caller, *this)
+      << IdentifyCUDATarget(Callee) << Callee << IdentifyCUDATarget(Caller);
+  CUDADiagBuilder(DiagKind, Callee->getLocation(), diag::note_previous_decl,
+                  Caller, *this)
+      << Callee;
+  return !IsImmediateErr;
 }
 
 void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) {
Index: cfe/trunk/lib/Sema/SemaExprCXX.cpp
===================================================================
--- cfe/trunk/lib/Sema/SemaExprCXX.cpp
+++ cfe/trunk/lib/Sema/SemaExprCXX.cpp
@@ -685,7 +685,8 @@
 
   // Exceptions aren't allowed in CUDA device code.
   if (getLangOpts().CUDA)
-    CheckCUDAExceptionExpr(OpLoc, "throw");
+    CUDADiagIfDeviceCode(OpLoc, diag::err_cuda_device_exceptions)
+        << "throw" << CurrentCUDATarget();
 
   if (getCurScope() && getCurScope()->isOpenMPSimdDirectiveScope())
     Diag(OpLoc, diag::err_omp_simd_region_cannot_use_stmt) << "throw";
Index: cfe/trunk/lib/Sema/SemaStmt.cpp
===================================================================
--- cfe/trunk/lib/Sema/SemaStmt.cpp
+++ cfe/trunk/lib/Sema/SemaStmt.cpp
@@ -3648,7 +3648,8 @@
 
   // Exceptions aren't allowed in CUDA device code.
   if (getLangOpts().CUDA)
-    CheckCUDAExceptionExpr(TryLoc, "try");
+    CUDADiagIfDeviceCode(TryLoc, diag::err_cuda_device_exceptions)
+        << "try" << CurrentCUDATarget();
 
   if (getCurScope() && getCurScope()->isOpenMPSimdDirectiveScope())
     Diag(TryLoc, diag::err_omp_simd_region_cannot_use_stmt) << "try";
Index: cfe/trunk/lib/Sema/SemaType.cpp
===================================================================
--- cfe/trunk/lib/Sema/SemaType.cpp
+++ cfe/trunk/lib/Sema/SemaType.cpp
@@ -2249,8 +2249,8 @@
     return QualType();
   }
   // CUDA device code doesn't support VLAs.
-  if (getLangOpts().CUDA && T->isVariableArrayType() && !CheckCUDAVLA(Loc))
-    return QualType();
+  if (getLangOpts().CUDA && T->isVariableArrayType())
+    CUDADiagIfDeviceCode(Loc, diag::err_cuda_vla) << CurrentCUDATarget();
 
   // If this is not C99, extwarn about VLA's and C99 array size modifiers.
   if (!getLangOpts().C99) {
Index: cfe/trunk/test/SemaCUDA/exceptions-host-device.cu
===================================================================
--- cfe/trunk/test/SemaCUDA/exceptions-host-device.cu
+++ cfe/trunk/test/SemaCUDA/exceptions-host-device.cu
@@ -14,8 +14,8 @@
   throw NULL;
   try {} catch(void*) {}
 #ifndef HOST
-  // expected-error@-3 {{cannot use 'throw' in __host__ __device__ function 'hd1'}}
-  // expected-error@-3 {{cannot use 'try' in __host__ __device__ function 'hd1'}}
+  // expected-error@-3 {{cannot use 'throw' in __host__ __device__ function}}
+  // expected-error@-3 {{cannot use 'try' in __host__ __device__ function}}
 #endif
 }
 
@@ -31,8 +31,8 @@
   throw NULL;
   try {} catch(void*) {}
 #ifndef HOST
-  // expected-error@-3 {{cannot use 'throw' in __host__ __device__ function 'hd3'}}
-  // expected-error@-3 {{cannot use 'try' in __host__ __device__ function 'hd3'}}
+  // expected-error@-3 {{cannot use 'throw' in __host__ __device__ function}}
+  // expected-error@-3 {{cannot use 'try' in __host__ __device__ function}}
 #endif
 }
 __device__ void call_hd3() { hd3(); }
Index: cfe/trunk/test/SemaCUDA/exceptions.cu
===================================================================
--- cfe/trunk/test/SemaCUDA/exceptions.cu
+++ cfe/trunk/test/SemaCUDA/exceptions.cu
@@ -9,13 +9,13 @@
 }
 __device__ void device() {
   throw NULL;
-  // expected-error@-1 {{cannot use 'throw' in __device__ function 'device'}}
+  // expected-error@-1 {{cannot use 'throw' in __device__ function}}
   try {} catch(void*) {}
-  // expected-error@-1 {{cannot use 'try' in __device__ function 'device'}}
+  // expected-error@-1 {{cannot use 'try' in __device__ function}}
 }
 __global__ void kernel() {
   throw NULL;
-  // expected-error@-1 {{cannot use 'throw' in __global__ function 'kernel'}}
+  // expected-error@-1 {{cannot use 'throw' in __global__ function}}
   try {} catch(void*) {}
-  // expected-error@-1 {{cannot use 'try' in __global__ function 'kernel'}}
+  // expected-error@-1 {{cannot use 'try' in __global__ function}}
 }