Index: include/clang/Basic/LangOptions.def
===================================================================
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -165,6 +165,7 @@
 LANGOPT(CUDAIsDevice , 1, 0, "Compiling for CUDA device")
 LANGOPT(CUDAAllowHostCallsFromHostDevice, 1, 0, "Allow host device functions to call host functions")
 LANGOPT(CUDADisableTargetCallChecks, 1, 0, "Disable checks for call targets (host, device, etc.)")
+LANGOPT(CUDAUsesLibDevice , 1, 0, "Apply Internalize and NVVMReflect passes to linked bitcode.")
 
 LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators")
 LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")
Index: include/clang/Driver/CC1Options.td
===================================================================
--- include/clang/Driver/CC1Options.td
+++ include/clang/Driver/CC1Options.td
@@ -651,6 +651,8 @@
   HelpText<"Disable all cross-target (host, device, etc.) call checks in CUDA">;
 def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">,
   HelpText<"Incorporate CUDA device-side binary into host object file.">;
+def fcuda_uses_libdevice : Flag<["-"], "fcuda-uses-libdevice">,
+  HelpText<"Apply Internalize and NVVMReflect passes to linked bitcode.">;
 
 } // let Flags = [CC1Option]
 
Index: lib/CodeGen/BackendUtil.cpp
===================================================================
--- lib/CodeGen/BackendUtil.cpp
+++ lib/CodeGen/BackendUtil.cpp
@@ -458,6 +458,8 @@
     BackendArgs.push_back("-limit-float-precision");
     BackendArgs.push_back(CodeGenOpts.LimitFloatPrecision.c_str());
   }
+  if (LangOpts.CUDA && LangOpts.CUDAIsDevice && LangOpts.CUDAUsesLibDevice)
+    BackendArgs.push_back("-nvptx-enable-reflect");
   for (unsigned i = 0, e = CodeGenOpts.BackendOptions.size(); i != e; ++i)
     BackendArgs.push_back(CodeGenOpts.BackendOptions[i].c_str());
   BackendArgs.push_back(nullptr);
Index: lib/CodeGen/CodeGenAction.cpp
===================================================================
--- lib/CodeGen/CodeGenAction.cpp
+++ lib/CodeGen/CodeGenAction.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
@@ -34,6 +35,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/Timer.h"
+#include "llvm/Transforms/IPO.h"
 #include <memory>
 using namespace clang;
 using namespace llvm;
@@ -160,10 +162,32 @@
 
   // Link LinkModule into this module if present, preserving its validity.
   if (LinkModule) {
+    std::vector<const char *> ModuleFuncNames;
+    // We need to internalize contents of the linked module but it
+    // has to be done *after* the linking because internalized
+    // symbols will not be linked in otherwise.
+    // In order to do that, we preserve current list of function names in
+    // the module and then pass it to Internalize pass to preserve.
+    if (LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+        LangOpts.CUDAUsesLibDevice)
+      for (auto &F : *TheModule)
+        if (!F.isDeclaration())
+          ModuleFuncNames.push_back(F.getName().data());
+
     if (Linker::LinkModules(
             M, LinkModule.get(),
             [=](const DiagnosticInfo &DI) { linkerDiagnosticHandler(DI); }))
       return;
+    if (LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+        LangOpts.CUDAUsesLibDevice) {
+      legacy::PassManager passes;
+      passes.add(createInternalizePass(ModuleFuncNames));
+      // Considering that most of the functions we've linked are
+      // not going to be used, we may want to eliminate them
+      // early.
+      passes.add(createGlobalDCEPass());
+      passes.run(*TheModule);
+    }
   }
 
   // Install an inline asm handler so that diagnostics get printed through
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -1392,6 +1392,9 @@
   if (Args.hasArg(OPT_fcuda_is_device))
     Opts.CUDAIsDevice = 1;
 
+  if (Args.hasArg(OPT_fcuda_uses_libdevice))
+    Opts.CUDAUsesLibDevice = 1;
+
   if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device))
     Opts.CUDAAllowHostCallsFromHostDevice = 1;
 
Index: test/CodeGenCUDA/Inputs/device-code.ll
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/Inputs/device-code.ll
@@ -0,0 +1,28 @@
+; Simple bit of IR to mimic CUDA's libdevice. We want to be
+; able to link with it and we need to make sure all __nvvm_reflect
+; calls are eliminated by the time PTX has been produced.
+
+target triple = "nvptx-unknown-cuda"
+
+declare i32 @__nvvm_reflect(i8*)
+
+@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"
+
+define float @_Z17device_mul_or_addff(float %a, float %b) {
+  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+  %cmp = icmp ne i32 %reflect, 0
+  br i1 %cmp, label %use_mul, label %use_add
+
+use_mul:
+  %ret1 = fmul float %a, %b
+  br label %exit
+
+use_add:
+  %ret2 = fadd float %a, %b
+  br label %exit
+
+exit:
+  %ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
+
+  ret float %ret
+}
Index: test/CodeGenCUDA/link-device-bitcode.cu
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/link-device-bitcode.cu
@@ -0,0 +1,60 @@
+// Test for linking with CUDA's libdevice as outlined in
+// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice
+//
+// REQUIRES: nvptx-registered-target
+//
+// Prepare bitcode file to link with
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \
+// RUN:    %S/Inputs/device-code.ll
+//
+// Make sure function in device-code gets linked in and internalized.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -emit-llvm \
+// RUN:    -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR
+//
+// Make sure function in device-code gets linked but is not internalized
+// without -fcuda-uses-libdevice
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -emit-llvm \
+// RUN:    -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR-NLD
+//
+// NVVMReflect is a target-specific pass that runs after -emit-llvm prints
+// IR, so we need to check NVPTX to make sure that the pass did happen
+// and __nvvm_reflect calls were eliminated.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -S -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-PTX
+
+#include "Inputs/cuda.h"
+
+__device__ float device_mul_or_add(float a, float b);
+
+// CHECK-IR-LABEL: define void @_Z26should_not_be_internalizedPf(
+// CHECK-PTX-LABEL: .visible .func _Z26should_not_be_internalizedPf(
+__device__ void should_not_be_internalized(float *data) {}
+
+// Make sure kernel call has not been internalized.
+// CHECK-IR-LABEL: define void @_Z6kernelPfS_
+// CHECK-PTX-LABEL: .visible .entry _Z6kernelPfS_(
+__global__ __attribute__((used)) void kernel(float *out, float *in) {
+  *out = device_mul_or_add(in[0], in[1]);
+  should_not_be_internalized(out);
+}
+
+// Make sure device_mul_or_add() is present in IR, is internal and
+// calls __nvvm_reflect().
+// CHECK-IR-LABEL: define internal float @_Z17device_mul_or_addff(
+// CHECK-IR-NLD-LABEL: define float @_Z17device_mul_or_addff(
+// CHECK-IR: call i32 @__nvvm_reflect
+// CHECK-IR: ret float
+
+// By the time device_mul_or_add() makes it to PTX, __nvvm_reflect references
+// should be gone.
+// CHECK-PTX-NOT: .visible
+// CHECK-PTX-LABEL: .func (.param .b32 func_retval0) _Z17device_mul_or_addff(
+// CHECK-PTX-NOT: __nvvm_reflect
+// CHECK-PTX-NOT: %reflect
+// CHECK-PTX: add.rn.f32
+// CHECK-PTX: ret;