Index: lib/CodeGen/CodeGenModule.h
===================================================================
--- lib/CodeGen/CodeGenModule.h
+++ lib/CodeGen/CodeGenModule.h
@@ -301,6 +301,11 @@
   /// yet.
   std::map<StringRef, GlobalDecl> DeferredDecls;
 
+  /// Contains all GlobalDecls for function definitions that do not
+  /// match current compilation mode. We'll emit these as-needed if
+  /// there's no suitable mode-matching function.
+  std::map<StringRef, GlobalDecl> DeferredCudaDecls;
+
   /// This is a list of deferred decls which we have seen that *are* actually
   /// referenced. These get code generated when the module is done.
   struct DeferredGlobal {
@@ -1179,6 +1184,9 @@
   /// Emit any needed decls for which code generation was deferred.
   void EmitDeferred();
 
+  /// Emit any needed CUDA decls for which code generation was deferred.
+  void EmitDeferredCudaDecls();
+
   /// Call replaceAllUsesWith on all pairs in Replacements.
   void applyReplacements();
Index: lib/CodeGen/CodeGenModule.cpp
===================================================================
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -381,6 +381,8 @@
   if (CoverageMapping)
     CoverageMapping->emit();
   emitLLVMUsed();
+  if (LangOpts.CUDA)
+    EmitDeferredCudaDecls();
 
   if (CodeGenOpts.Autolink &&
       (Context.getLangOpts().Modules || !LinkerOptionsMetadata.empty())) {
@@ -1143,6 +1145,34 @@
       LinkerOptionsMetadata));
 }
 
+void CodeGenModule::EmitDeferredCudaDecls() {
+  assert(DeferredDeclsToEmit.empty() && "Unexpected deferred decls.");
+
+  // Check whether any of deferred CUDA decls are referred to by the
+  // code in current TU, move them to the list of deferred decls to
+  // emit and call EmitDeferred() to emit them. The decls we emit may
+  // create more unresolved references, so we continue the process
+  // until there are no more references we can resolve.
+  bool NeedToEmit;
+  do {
+    NeedToEmit = false;
+    for (auto I = DeferredCudaDecls.begin(), E = DeferredCudaDecls.end();
+         I != E;)
+      if (llvm::GlobalValue *DGV = GetGlobalValue(I->first)) {
+        if (DGV->isDeclaration()) {
+          addDeferredDeclToEmit(DGV, I->second);
+          NeedToEmit = true;
+        }
+        I = DeferredCudaDecls.erase(I);
+        continue;
+      } else
+        ++I;
+
+    if (NeedToEmit)
+      EmitDeferred();
+  } while (NeedToEmit);
+}
+
 void CodeGenModule::EmitDeferred() {
   // Emit code for any potentially referenced deferred decls. Since a
   // previously unused static decl may become used during the generation of code
@@ -1417,18 +1447,28 @@
 
   // If this is CUDA, be selective about which declarations we emit.
   if (LangOpts.CUDA) {
+    bool GlobalMatchesCudaMode = true;
     if (LangOpts.CUDAIsDevice) {
       if (!Global->hasAttr<CUDADeviceAttr>() &&
           !Global->hasAttr<CUDAGlobalAttr>() &&
           !Global->hasAttr<CUDAConstantAttr>() &&
           !Global->hasAttr<CUDASharedAttr>())
-        return;
+        GlobalMatchesCudaMode = false;
     } else {
       if (!Global->hasAttr<CUDAHostAttr>() && (
           Global->hasAttr<CUDADeviceAttr>() ||
           Global->hasAttr<CUDAConstantAttr>() ||
           Global->hasAttr<CUDASharedAttr>()))
-        return;
+        GlobalMatchesCudaMode = false;
+    }
+
+    if (!GlobalMatchesCudaMode) {
+      if (getLangOpts().CUDADisableTargetCallChecks) {
+        const auto *FD = dyn_cast<FunctionDecl>(Global);
+        if (FD && FD->doesThisDeclarationHaveABody())
+          DeferredCudaDecls[getMangledName(GD)] = GD;
+      }
+      return;
     }
   }
Index: test/CodeGenCUDA/cross-call.cu
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/cross-call.cu
@@ -0,0 +1,48 @@
+// Check handling of code generation for calls crossing host/device boundary.
+// Calls crossing host/device boundary must prefer overload variant that
+// matches current compilation mode. Make sure that we do emit intermediary
+// functions, whether they are host or device.
+
+// RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:   -fcuda-disable-target-call-checks -fcuda-target-overloads -o - %s \
+// RUN:   | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
+//
+// RUN: %clang_cc1 -DNOCHECKS -triple nvptx64-nvidia-cuda -emit-llvm \
+// RUN:   -fcuda-disable-target-call-checks -fcuda-target-overloads \
+// RUN:   -fcuda-is-device -o - %s \
+// RUN:   | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
+
+#include "Inputs/cuda.h"
+
+static __device__ int cross_leaf() { return 41; };
+static __host__ int cross_leaf() { return 42; };
+
+// Here we should pick leaf function that matches compilation mode.
+static __host__ int cross_gate() { return cross_leaf(); }
+
+// two levels of host/device cross-calls to verify that we can deal
+// with cross-calls in both directions in all compilation modes.
+static __host__ int cross_hbridge() { return cross_gate(); }
+static __device__ int cross_dbridge() { return cross_gate(); }
+static __host__ int cross_hbridge2() { return cross_dbridge(); }
+static __device__ int cross_dbridge2() { return cross_hbridge(); }
+
+__host__ int cross_host() { return cross_dbridge2(); }
+__device__ int cross_device() { return cross_hbridge2(); }
+
+// Make sure we only emit globals for current compilation mode.
+// CHECK-HOST: define i32 @_Z10cross_hostv
+// CHECK-DEVICE-NOT: define i32 @_Z10cross_hostv
+// CHECK-DEVICE: define i32 @_Z12cross_devicev
+// CHECK-HOST-NOT: define i32 @_Z12cross_devicev
+
+// .. but allow non-matching ones if they are used.
+// CHECK-HOST-DAG: define internal i32 @_ZL13cross_hbridgev()
+// CHECK-HOST-DAG: define internal i32 @_ZL14cross_dbridge2v()
+// CHECK-DEVICE-DAG: define internal i32 @_ZL13cross_dbridgev()
+// CHECK-DEVICE-DAG: define internal i32 @_ZL14cross_hbridge2v()
+
+// .. and that we pick the leaf function that matches compilation mode.
+// CHECK-BOTH: define internal i32 @_ZL10cross_leafv()
+// CHECK-DEVICE: ret i32 41
+// CHECK-HOST: ret i32 42
Index: test/CodeGenCUDA/host-device-calls-host.cu
===================================================================
--- test/CodeGenCUDA/host-device-calls-host.cu
+++ test/CodeGenCUDA/host-device-calls-host.cu
@@ -1,4 +1,15 @@
-// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-allow-host-calls-from-host-device -fcuda-is-device -Wno-cuda-compat -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device \
+// RUN:   -fcuda-allow-host-calls-from-host-device \
+// RUN:   -Wno-cuda-compat -emit-llvm -o - \
+// RUN:   | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NORMAL
+
+// Enabling target overloads and disabling target call checks allows
+// cross-calling between host/device. We expect to emit IR for used
+// host functions in this case.
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device \
+// RUN:   -fcuda-target-overloads -fcuda-disable-target-call-checks \
+// RUN:   -Wno-cuda-compat -emit-llvm -o - \
+// RUN:   | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-HDMIX
 
 #include "Inputs/cuda.h"
 
@@ -12,7 +23,8 @@
   host_function();
 }
 
-// CHECK: declare void @host_function
+// CHECK-NORMAL-LABEL: declare void @host_function
+// CHECK-HDMIX-LABEL: define void @host_function
 
 // CHECK-LABEL: define void @hd_function_b
 extern "C"