Index: lib/AST/ASTContext.cpp
===================================================================
--- lib/AST/ASTContext.cpp
+++ lib/AST/ASTContext.cpp
@@ -8336,7 +8336,19 @@
   // Constructors and destructors are required.
   if (FD->hasAttr<ConstructorAttr>() || FD->hasAttr<DestructorAttr>())
     return true;
-
+
+  // Force all CUDA kernels to be emitted on the device side. Otherwise,
+  // templated kernels may never be emitted, as they are only used from
+  // host-side code, which we never emit on the device side and which
+  // therefore would never trigger us to emit the device-side kernel it
+  // might have instantiated. The trade-off is that emitting all kernels
+  // is over-conservative and we may emit more of them than necessary.
+  // If the excess of generated GPU code becomes a problem, we can
+  // revisit this.
+  if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice &&
+      FD->hasAttr<CUDAGlobalAttr>())
+    return true;
+
   // The key function for a class is required. This rule only comes
   // into play when inline functions can be key functions, though.
   if (getTargetInfo().getCXXABI().canKeyFunctionBeInline()) {
Index: test/CodeGenCUDA/ptx-kernels.cu
===================================================================
--- test/CodeGenCUDA/ptx-kernels.cu
+++ test/CodeGenCUDA/ptx-kernels.cu
@@ -1,3 +1,7 @@
+// Make sure that __global__ functions are emitted along with correct
+// annotations and are added to @llvm.used to prevent their elimination.
+// REQUIRES: nvptx-registered-target
+//
 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s
 
 #include "Inputs/cuda.h"
@@ -13,4 +17,10 @@
   device_function();
 }
 
+// Make sure host-instantiated kernels are preserved on the device side.
+template <typename T> __global__ void templated_kernel(T param) {}
+// CHECK-LABEL: define linkonce_odr void @_Z16templated_kernelIiEvT_
+void host_function() { templated_kernel<<<0,0>>>(0); }
+
 // CHECK: !{{[0-9]+}} = !{void ()* @global_function, !"kernel", i32 1}
+// CHECK: !{{[0-9]+}} = !{void (i32)* @_Z16templated_kernelIiEvT_, !"kernel", i32 1}
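
For context, a minimal sketch of the failure mode this patch addresses (the file and function names below are hypothetical and not part of the patch): a kernel template whose only instantiation comes from a host-side launch has no device-side reference, so without forcing emission of __global__ functions the device-side compilation could skip the instantiation that the host-side launch expects to find.

// example.cu -- hypothetical illustration only; not part of this patch.
template <typename T>
__global__ void scale_kernel(T *data, T factor) {
  // Trivial device code; the body is irrelevant to the emission issue.
  data[0] *= factor;
}

void launch_from_host(float *device_buffer) {
  // The only reference to scale_kernel<float> is this host-side launch.
  // When compiling for the device (-fcuda-is-device), host functions are
  // not emitted, so nothing on the device side would otherwise require
  // the scale_kernel<float> instantiation to be generated.
  scale_kernel<float><<<1, 1>>>(device_buffer, 2.0f);
}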