Index: include/clang/Basic/LangOptions.def
===================================================================
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -165,6 +165,7 @@
 LANGOPT(CUDAIsDevice , 1, 0, "Compiling for CUDA device")
 LANGOPT(CUDAAllowHostCallsFromHostDevice, 1, 0, "Allow host device functions to call host functions")
 LANGOPT(CUDADisableTargetCallChecks, 1, 0, "Disable checks for call targets (host, device, etc.)")
+LANGOPT(CUDAUsesLibDevice , 1, 0, "Apply Internalize and NVVMReflect passes to linked bitcode.")
 
 LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators")
 LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")
Index: include/clang/Driver/CC1Options.td
===================================================================
--- include/clang/Driver/CC1Options.td
+++ include/clang/Driver/CC1Options.td
@@ -651,6 +651,8 @@
   HelpText<"Disable all cross-target (host, device, etc.) call checks in CUDA">;
 def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">,
   HelpText<"Incorporate CUDA device-side binary into host object file.">;
+def fcuda_uses_libdevice : Flag<["-"], "fcuda-uses-libdevice">,
+  HelpText<"Apply Internalize and NVVMReflect passes to linked bitcode.">;
 
 } // let Flags = [CC1Option]
 
Index: lib/CodeGen/BackendUtil.cpp
===================================================================
--- lib/CodeGen/BackendUtil.cpp
+++ lib/CodeGen/BackendUtil.cpp
@@ -458,6 +458,8 @@
     BackendArgs.push_back("-limit-float-precision");
     BackendArgs.push_back(CodeGenOpts.LimitFloatPrecision.c_str());
   }
+  if (LangOpts.CUDA && LangOpts.CUDAIsDevice && LangOpts.CUDAUsesLibDevice)
+    BackendArgs.push_back("-nvptx-enable-reflect");
   for (unsigned i = 0, e = CodeGenOpts.BackendOptions.size(); i != e; ++i)
     BackendArgs.push_back(CodeGenOpts.BackendOptions[i].c_str());
   BackendArgs.push_back(nullptr);
Index: lib/CodeGen/CodeGenAction.cpp
===================================================================
--- lib/CodeGen/CodeGenAction.cpp
+++ lib/CodeGen/CodeGenAction.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
@@ -34,6 +35,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/Timer.h"
+#include "llvm/Transforms/IPO.h"
 #include <memory>
 using namespace clang;
 using namespace llvm;
@@ -160,10 +162,32 @@
 
   // Link LinkModule into this module if present, preserving its validity.
   if (LinkModule) {
+    std::vector<const char *> ModuleFuncNames;
+    // We need to internalize contents of the linked module but it
+    // has to be done *after* the linking because internalized
+    // symbols will not be linked in otherwise.
+    // In order to do that, we preserve current list of function names in
+    // the module and then pass it to Internalize pass to preserve.
+    if (LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+        LangOpts.CUDAUsesLibDevice)
+      for (auto &F : *TheModule)
+        if (!F.isDeclaration())
+          ModuleFuncNames.push_back(F.getName().data());
+
     if (Linker::LinkModules(
             M, LinkModule.get(),
             [=](const DiagnosticInfo &DI) { linkerDiagnosticHandler(DI); }))
       return;
+    if (LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+        LangOpts.CUDAUsesLibDevice) {
+      legacy::PassManager passes;
+      passes.add(createInternalizePass(ModuleFuncNames));
+      // Considering that most of the functions we've linked are
+      // not going to be used, we may want to eliminate them
+      // early.
+      passes.add(createGlobalDCEPass());
+      passes.run(*TheModule);
+    }
   }
 
   // Install an inline asm handler so that diagnostics get printed through
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -1392,6 +1392,9 @@
   if (Args.hasArg(OPT_fcuda_is_device))
     Opts.CUDAIsDevice = 1;
 
+  if (Args.hasArg(OPT_fcuda_uses_libdevice))
+    Opts.CUDAUsesLibDevice = 1;
+
   if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device))
     Opts.CUDAAllowHostCallsFromHostDevice = 1;
 
Index: test/CodeGenCUDA/Inputs/device-code.ll
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/Inputs/device-code.ll
@@ -0,0 +1,28 @@
+; Simple bit of IR to mimic CUDA's libdevice. We want to be
+; able to link with it and we need to make sure all __nvvm_reflect
+; calls are eliminated by the time PTX has been produced.
+
+target triple = "nvptx-unknown-cuda"
+
+declare i32 @__nvvm_reflect(i8*)
+
+@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"
+
+define float @_Z17device_mul_or_addff(float %a, float %b) {
+  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+  %cmp = icmp ne i32 %reflect, 0
+  br i1 %cmp, label %use_mul, label %use_add
+
+use_mul:
+  %ret1 = fmul float %a, %b
+  br label %exit
+
+use_add:
+  %ret2 = fadd float %a, %b
+  br label %exit
+
+exit:
+  %ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
+
+  ret float %ret
+}
Index: test/CodeGenCUDA/link-device-bitcode.cu
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/link-device-bitcode.cu
@@ -0,0 +1,60 @@
+// Test for linking with CUDA's libdevice as outlined in
+// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice
+//
+// REQUIRES: nvptx-registered-target
+//
+// Prepare bitcode file to link with
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \
+// RUN:    %S/Inputs/device-code.ll
+//
+// Make sure function in device-code gets linked in and internalized.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -emit-llvm \
+// RUN:    -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR
+//
+// Make sure function in device-code gets linked but is not internalized
+// without -fcuda-uses-libdevice
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -emit-llvm \
+// RUN:    -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR-NLD
+//
+// NVVMReflect is a target-specific pass that runs after -emit-llvm prints
+// IR, so we need to check NVPTX to make sure that the pass did happen
+// and __nvvm_reflect calls were eliminated.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -S -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-PTX
+
+#include "Inputs/cuda.h"
+
+__device__ float device_mul_or_add(float a, float b);
+
+// CHECK-IR-LABEL: define void @_Z26should_not_be_internalizedPf(
+// CHECK-PTX-LABEL: .visible .func _Z26should_not_be_internalizedPf(
+__device__ void should_not_be_internalized(float *data) {}
+
+// Make sure kernel call has not been internalized.
+// CHECK-IR-LABEL: define void @_Z6kernelPfS_
+// CHECK-PTX-LABEL: .visible .entry _Z6kernelPfS_(
+__global__ __attribute__((used)) void kernel(float *out, float *in) {
+  *out = device_mul_or_add(in[0], in[1]);
+  should_not_be_internalized(out);
+}
+
+// Make sure device_mul_or_add() is present in IR, is internal and
+// calls __nvvm_reflect().
+// CHECK-IR-LABEL: define internal float @_Z17device_mul_or_addff(
+// CHECK-IR-NLD-LABEL: define float @_Z17device_mul_or_addff(
+// CHECK-IR: call i32 @__nvvm_reflect
+// CHECK-IR: ret float
+
+// By the time device_mul_or_add() makes it to PTX, __nvvm_reflect references
+// should be gone.
+// CHECK-PTX-NOT: .visible
+// CHECK-PTX-LABEL: .func (.param .b32 func_retval0) _Z17device_mul_or_addff(
+// CHECK-PTX-NOT: __nvvm_reflect
+// CHECK-PTX-NOT: %reflect
+// CHECK-PTX: add.rn.f32
+// CHECK-PTX: ret;