Index: lib/AST/ASTContext.cpp
===================================================================
--- lib/AST/ASTContext.cpp
+++ lib/AST/ASTContext.cpp
@@ -8336,7 +8336,19 @@
   // Constructors and destructors are required.
   if (FD->hasAttr<ConstructorAttr>() || FD->hasAttr<DestructorAttr>())
     return true;
-
+
+  // Force all CUDA kernels to be emitted on the device side. Otherwise,
+  // templated kernels may never be emitted, as they are only used from
+  // host-side code, which we never emit on the device side and which
+  // therefore would never trigger us to emit the device-side kernel it
+  // might have instantiated. The trade-off is that emitting all kernels
+  // is over-conservative and we may emit more of them than necessary.
+  // If the excess of generated GPU code becomes a problem, we can
+  // revisit this.
+  if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice &&
+      FD->hasAttr<CUDAGlobalAttr>())
+    return true;
+
   // The key function for a class is required. This rule only comes
   // into play when inline functions can be key functions, though.
   if (getTargetInfo().getCXXABI().canKeyFunctionBeInline()) {
Index: test/CodeGenCUDA/ptx-kernels.cu
===================================================================
--- test/CodeGenCUDA/ptx-kernels.cu
+++ test/CodeGenCUDA/ptx-kernels.cu
@@ -1,3 +1,7 @@
+// Make sure that __global__ functions are emitted along with correct
+// annotations and are added to @llvm.used to prevent their elimination.
+// REQUIRES: nvptx-registered-target
+//
 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s
 
 #include "Inputs/cuda.h"
@@ -13,4 +17,10 @@
   device_function();
 }
 
+// Make sure host-instantiated kernels are preserved on the device side.
+template <typename T> __global__ void templated_kernel(T param) {}
+// CHECK-LABEL: define linkonce_odr void @_Z16templated_kernelIiEvT_
+void host_function() { templated_kernel<<<0,0>>>(0); }
+
 // CHECK: !{{[0-9]+}} = !{void ()* @global_function, !"kernel", i32 1}
+// CHECK: !{{[0-9]+}} = !{void (i32)* @_Z16templated_kernelIiEvT_, !"kernel", i32 1}
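
For context, a minimal sketch of the failure mode this patch addresses (the file and function names below are hypothetical and not part of the patch): a kernel template whose only instantiation comes from a host-side launch has no device-side reference, so without forcing emission of __global__ functions the device-side compilation could skip the instantiation that the host-side launch expects to find.

// example.cu -- hypothetical illustration only; not part of this patch.
template <typename T>
__global__ void scale_kernel(T *data, T factor) {
  // Trivial device code; the body is irrelevant to the emission issue.
  data[0] *= factor;
}

void launch_from_host(float *device_buffer) {
  // The only reference to scale_kernel<float> is this host-side launch.
  // When compiling for the device (-fcuda-is-device), host functions are
  // not emitted, so nothing on the device side would otherwise require
  // the scale_kernel<float> instantiation to be generated.
  scale_kernel<float><<<1, 1>>>(device_buffer, 2.0f);
}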