Index: clang/include/clang/AST/ASTContext.h =================================================================== --- clang/include/clang/AST/ASTContext.h +++ clang/include/clang/AST/ASTContext.h @@ -1160,6 +1160,10 @@ /// Keep track of CUDA/HIP device-side variables ODR-used by host code. llvm::DenseSet CUDADeviceVarODRUsedByHost; + /// Keep track of CUDA/HIP external kernels or device variables ODR-used by + /// host code. + llvm::DenseSet CUDAExternalDeviceDeclODRUsedByHost; + ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents, SelectorTable &sels, Builtin::Context &builtins, TranslationUnitKind TUKind); Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -579,6 +579,30 @@ } } + // Emit a global array containing all external kernels or device variables + // used by host functions and mark it as used for CUDA/HIP. This is necessary + // to get kernels or device variables in archives linked in even if these + // kernels or device variables are only used in host functions. + if (!Context.CUDAExternalDeviceDeclODRUsedByHost.empty()) { + SmallVector UsedArray; + for (auto D : Context.CUDAExternalDeviceDeclODRUsedByHost) { + GlobalDecl GD; + if (auto *FD = dyn_cast(D)) + GD = GlobalDecl(FD, KernelReferenceKind::Kernel); + else + GD = GlobalDecl(D); + UsedArray.push_back(llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + GetAddrOfGlobal(GD), Int8PtrTy)); + } + + llvm::ArrayType *ATy = llvm::ArrayType::get(Int8PtrTy, UsedArray.size()); + + auto *GV = new llvm::GlobalVariable( + getModule(), ATy, false, llvm::GlobalValue::AppendingLinkage, + llvm::ConstantArray::get(ATy, UsedArray), "hip.used.external"); + addCompilerUsedGlobal(GV); + } + emitLLVMUsed(); if (SanStats) SanStats->finish(); Index: clang/lib/Sema/SemaCUDA.cpp =================================================================== --- clang/lib/Sema/SemaCUDA.cpp +++ clang/lib/Sema/SemaCUDA.cpp @@ -819,8 +819,13 @@ } }(); - if (DiagKind == SemaDiagnosticBuilder::K_Nop) + if (DiagKind == SemaDiagnosticBuilder::K_Nop) { + // For -fgpu-rdc, keep track of external kernels used by host functions. + if (LangOpts.CUDAIsDevice && LangOpts.GPURelocatableDeviceCode && + Callee->hasAttr() && !Callee->isDefined()) + getASTContext().CUDAExternalDeviceDeclODRUsedByHost.insert(Callee); return true; + } // Avoid emitting this error twice for the same location. Using a hashtable // like this is unfortunate, but because we must continue parsing as normal Index: clang/lib/Sema/SemaExpr.cpp =================================================================== --- clang/lib/Sema/SemaExpr.cpp +++ clang/lib/Sema/SemaExpr.cpp @@ -17908,8 +17908,7 @@ } } else if (VarTarget == Sema::CVT_Device && (UserTarget == Sema::CFT_Host || - UserTarget == Sema::CFT_HostDevice) && - !Var->hasExternalStorage()) { + UserTarget == Sema::CFT_HostDevice)) { // Record a CUDA/HIP device side variable if it is ODR-used // by host code. This is done conservatively, when the variable is // referenced in any of the following contexts: @@ -17920,7 +17919,10 @@ // be visible in the device compilation for the compiler to be able to // emit template variables instantiated by host code only and to // externalize the static device side variable ODR-used by host code. - SemaRef.getASTContext().CUDADeviceVarODRUsedByHost.insert(Var); + if (!Var->hasExternalStorage()) + SemaRef.getASTContext().CUDADeviceVarODRUsedByHost.insert(Var); + else if (SemaRef.LangOpts.GPURelocatableDeviceCode) + SemaRef.getASTContext().CUDAExternalDeviceDeclODRUsedByHost.insert(Var); } } Index: clang/test/CodeGenCUDA/host-used-extern.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/host-used-extern.cu @@ -0,0 +1,51 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \ +// RUN: -fgpu-rdc -std=c++11 -emit-llvm -o - -target-cpu gfx906 | FileCheck %s + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \ +// RUN: -fgpu-rdc -std=c++11 -emit-llvm -o - -target-cpu gfx906 \ +// RUN: | FileCheck -check-prefix=NEG %s + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \ +// RUN: -std=c++11 -emit-llvm -o - -target-cpu gfx906 \ +// RUN: | FileCheck -check-prefixes=NEG,NORDC %s + +#include "Inputs/cuda.h" + +// CHECK-LABEL: @hip.used.external = appending {{.*}}global +// CHECK-DAG: @_Z7kernel1v +// CHECK-DAG: @_Z7kernel4v +// CHECK-DAG: @var1 +// CHECK-LABEL: @llvm.compiler.used = {{.*}} @hip.used.external + +// NEG-NOT: @hip.used.external = {{.*}} @_Z7kernel2v +// NEG-NOT: @hip.used.external = {{.*}} @_Z7kernel3v +// NEG-NOT: @hip.used.external = {{.*}} @var2 +// NEG-NOT: @hip.used.external = {{.*}} @var3 +// NORDC-NOT: @hip.used.external = {{.*}} @_Z7kernel1v +// NORDC-NOT: @hip.used.external = {{.*}} @_Z7kernel4v +// NORDC-NOT: @hip.used.external = {{.*}} @var1 + +__global__ void kernel1(); + +// kernel2 is not marked as used since it is a definition. +__global__ void kernel2() {} + +// kernel3 is not marked as used since it is not called by host function. +__global__ void kernel3(); + +// kernel4 is marked as used even though it is not called. +__global__ void kernel4(); + +extern __device__ int var1; + +__device__ int var2; + +extern __device__ int var3; + +void use(int *p); + +void test() { + kernel1<<<1, 1>>>(); + void *p = (void*)kernel4; + use(&var1); +}