Index: clang/lib/Sema/SemaCUDA.cpp =================================================================== --- clang/lib/Sema/SemaCUDA.cpp +++ clang/lib/Sema/SemaCUDA.cpp @@ -790,9 +790,12 @@ // If the caller is known-emitted, mark the callee as known-emitted. // Otherwise, mark the call in our call graph so we can traverse it later. bool CallerKnownEmitted = IsKnownEmitted(*this, Caller); - if (CallerKnownEmitted) - MarkKnownEmitted(*this, Caller, Callee, Loc); - else { + if (CallerKnownEmitted) { + // Host-side references to a __global__ function refer to the stub, so the + // function itself is never emitted and therefore should not be marked. + if (getLangOpts().CUDAIsDevice || IdentifyCUDATarget(Callee) != CFT_Global) + MarkKnownEmitted(*this, Caller, Callee, Loc); + } else { // If we have // host fn calls kernel fn calls host+device, // the HD function does not get instantiated on the host. We model this by Index: clang/test/SemaCUDA/call-device-fn-from-host.cu =================================================================== --- clang/test/SemaCUDA/call-device-fn-from-host.cu +++ clang/test/SemaCUDA/call-device-fn-from-host.cu @@ -83,3 +83,10 @@ __host__ __device__ void fn_ptr_template() { auto* ptr = &device_fn; // Not an error because the template isn't instantiated. } + +// Launching a kernel from a host function does not result in code generation +// for it, so a kernel calling an HD function which calls a D function should +// not trigger errors. +static __host__ __device__ void hd_func() { device_fn(); } +__global__ void kernel() { hd_func(); } +void host_func(void) { kernel<<<1, 1>>>(); }