Index: lib/CMakeLists.txt
===================================================================
--- lib/CMakeLists.txt
+++ lib/CMakeLists.txt
@@ -106,6 +106,8 @@
     LLVMipo
     LLVMMC
     LLVMPasses
+    LLVMLinker
+    LLVMIRReader
     ${nvptx_libs}
     # The libraries below are required for darwin: http://PR26392
     LLVMBitReader
Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -31,6 +31,8 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetMachine.h"
@@ -102,6 +104,11 @@
                cl::Hidden, cl::init(false), cl::ZeroOrMore,
                cl::cat(PollyCategory));
 
+static cl::opt<std::string> LibDevice(
+    "polly-acc-libdevice", cl::desc("Path to CUDA libdevice"), cl::Hidden,
+    cl::init("/usr/local/cuda-7.5/nvvm/libdevice/libdevice.compute_20.10.bc"),
+    cl::ZeroOrMore, cl::cat(PollyCategory));
+
 static cl::opt<std::string>
     CudaVersion("polly-acc-cuda-version",
                 cl::desc("The CUDA version to compile for"), cl::Hidden,
@@ -598,6 +605,9 @@
   /// @param F The function to remove references to.
   void clearLoops(Function *F);
 
+  /// Link with the NVIDIA libdevice library (if needed and available).
+  void addLibDevice();
+
   /// Finalize the generation of the kernel function.
   ///
   /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
@@ -1300,13 +1310,32 @@
   return isl_bool_true;
 }
 
+/// A list of functions that are available in NVIDIA's libdevice.
+std::vector<std::string> LibDeviceFunctions = {"exp", "expf", "expl", "cos",
+                                               "cosf"};
+
+/// Return the corresponding CUDA libdevice function name for @p F.
+///
+/// Return "" if @p F has no libdevice counterpart.
+std::string getLibDeviceFunction(Function *F) {
+  for (auto Name : LibDeviceFunctions)
+    if (Name == F->getName())
+      return "__nv_" + Name;
+
+  return "";
+}
+
 /// Check if F is a function that we can code-generate in a GPU kernel.
-static bool isValidFunctionInKernel(llvm::Function *F) {
+static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) {
   assert(F && "F is an invalid pointer");
   // We string compare against the name of the function to allow
   // all variants of the intrinsic "llvm.sqrt.*", "llvm.fabs", and
   // "llvm.copysign".
   const StringRef Name = F->getName();
+
+  if (AllowLibDevice && getLibDeviceFunction(F).length() > 0)
+    return true;
+
   return F->isIntrinsic() &&
          (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
           Name.startswith("llvm.copysign"));
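Usage sketch (not part of the patch): with the LibDevice option above, a
specific libdevice copy can be selected on the opt command line, mirroring
the RUN lines of the new test below. The path here is an assumption for a
CUDA 8.0 install; CUDA 9 and later ship a single unified libdevice.10.bc
instead of per-compute-capability files:

    opt -load LLVMPolly.so -polly-codegen-ppcg \
        -polly-acc-libdevice=/usr/local/cuda-8.0/nvvm/libdevice/libdevice.compute_35.10.bc \
        -S < input.ll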
@@ -1322,14 +1351,16 @@
 /// Return `Function`s from `RawSubtreeValues`.
 static SetVector<Function *>
-getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
+getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues,
+                                 bool AllowLibDevice) {
   SetVector<Function *> SubtreeFunctions;
   for (Value *It : RawSubtreeValues) {
     Function *F = dyn_cast<Function>(It);
     if (F) {
-      assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
-                                           "this point if an invalid function "
-                                           "were present in a kernel.");
+      assert(isValidFunctionInKernel(F, AllowLibDevice) &&
+             "Code should have bailed out by "
+             "this point if an invalid function "
+             "were present in a kernel.");
       SubtreeFunctions.insert(F);
     }
   }
@@ -1383,8 +1414,11 @@
       make_filter_range(SubtreeValues, isValidSubtreeValue);
   SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
                                         ValidSubtreeValuesIt.end());
+
+  bool AllowLibDevice = Arch == GPUArch::NVPTX64;
+
   SetVector<Function *> ValidSubtreeFunctions(
-      getFunctionsFromRawSubtreeValues(SubtreeValues));
+      getFunctionsFromRawSubtreeValues(SubtreeValues, AllowLibDevice));
 
   // @see IslNodeBuilder::getReferencesInSubtree
   SetVector<Value *> ReplacedValues;
@@ -2079,6 +2113,44 @@
   return ASMStream.str();
 }
 
+void GPUNodeBuilder::addLibDevice() {
+  if (Arch != GPUArch::NVPTX64)
+    return;
+
+  bool RequiresLibDevice = false;
+
+  for (Function &F : GPUModule->functions()) {
+    if (!F.isDeclaration())
+      continue;
+
+    std::string LibDeviceFunc = getLibDeviceFunction(&F);
+    if (LibDeviceFunc.length() != 0) {
+      F.setName(LibDeviceFunc);
+      RequiresLibDevice = true;
+    }
+  }
+
+  if (RequiresLibDevice) {
+    SMDiagnostic Error;
+    auto LibDeviceModule =
+        parseIRFile(LibDevice, Error, GPUModule->getContext());
+
+    if (!LibDeviceModule) {
+      BuildSuccessful = false;
+      errs() << "Could not find libdevice. Skipping GPU kernel generation. "
+                "Please set -polly-acc-libdevice accordingly.\n";
+      return;
+    }
+
+    Linker L(*GPUModule);
+
+    // Set an nvptx64 target triple to avoid linker warnings. The original
+    // triple of the libdevice files is nvptx-unknown-unknown.
+    LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+    L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded);
+  }
+}
+
 std::string GPUNodeBuilder::finalizeKernelFunction() {
 
   if (verifyModule(*GPUModule)) {
@@ -2092,6 +2164,8 @@
     return "";
   }
 
+  addLibDevice();
+
   if (DumpKernelIR)
     outs() << *GPUModule << "\n";
@@ -2943,10 +3017,12 @@
   ///
   /// If this basic block does something with a `Function` other than calling
   /// a function that we support in a kernel, return true.
-  bool containsInvalidKernelFunctionInBllock(const BasicBlock *BB) {
+  bool containsInvalidKernelFunctionInBllock(const BasicBlock *BB,
+                                             bool AllowLibDevice) {
     for (const Instruction &Inst : *BB) {
      const CallInst *Call = dyn_cast<CallInst>(&Inst);
-      if (Call && isValidFunctionInKernel(Call->getCalledFunction())) {
+      if (Call &&
+          isValidFunctionInKernel(Call->getCalledFunction(), AllowLibDevice)) {
        continue;
      }
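For reference, the link step in addLibDevice() above is the standard LLVM
recipe for pulling selected definitions out of another module. The same
pattern in isolation, as a minimal sketch (helper name and error handling
are mine, not Polly's):

    #include "llvm/IR/Module.h"
    #include "llvm/IRReader/IRReader.h"
    #include "llvm/Linker/Linker.h"
    #include "llvm/Support/SourceMgr.h"
    #include <memory>

    using namespace llvm;

    static bool linkLibDevice(Module &KernelM, StringRef Path) {
      SMDiagnostic Err;
      std::unique_ptr<Module> LibDeviceM =
          parseIRFile(Path, Err, KernelM.getContext());
      if (!LibDeviceM)
        return false; // Missing or unparsable file.

      // libdevice is built for nvptx-unknown-unknown; matching the kernel
      // module's triple silences the linker's triple-mismatch warning.
      LibDeviceM->setTargetTriple(KernelM.getTargetTriple());

      // LinkOnlyNeeded copies only definitions that resolve declarations the
      // kernel module references; linkModules() returns true on error.
      return !Linker::linkModules(KernelM, std::move(LibDeviceM),
                                  Linker::Flags::LinkOnlyNeeded);
    }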
@@ -2962,16 +3038,17 @@
   }
 
   /// Return whether the Scop S uses functions in a way that we do not support.
-  bool containsInvalidKernelFunction(const Scop &S) {
+  bool containsInvalidKernelFunction(const Scop &S, bool AllowLibDevice) {
     for (auto &Stmt : S) {
       if (Stmt.isBlockStmt()) {
-        if (containsInvalidKernelFunctionInBllock(Stmt.getBasicBlock()))
+        if (containsInvalidKernelFunctionInBllock(Stmt.getBasicBlock(),
+                                                  AllowLibDevice))
           return true;
       } else {
         assert(Stmt.isRegionStmt() &&
                "Stmt was neither block nor region statement");
         for (const BasicBlock *BB : Stmt.getRegion()->blocks())
-          if (containsInvalidKernelFunctionInBllock(BB))
+          if (containsInvalidKernelFunctionInBllock(BB, AllowLibDevice))
             return true;
       }
     }
@@ -3056,7 +3133,8 @@
     // kernel. This may lead to a kernel trying to call a function on the host.
     // This also allows us to prevent codegen from trying to take the
     // address of an intrinsic function to send to the kernel.
-    if (containsInvalidKernelFunction(CurrentScop)) {
+    if (containsInvalidKernelFunction(CurrentScop,
+                                      Architecture == GPUArch::NVPTX64)) {
       DEBUG(
           dbgs()
          << "Scop contains function which cannot be materialised in a GPU "
Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll
===================================================================
--- /dev/null
+++ test/GPGPU/libdevice-functions-copied-into-kernel.ll
@@ -0,0 +1,74 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s \
+; RUN: -polly-acc-libdevice=%S/libdevice-functions-copied-into-kernel_libdevice.bc \
+; RUN: | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
+; RUN: -polly-acc-libdevice=%S/libdevice-functions-copied-into-kernel_libdevice.bc \
+; RUN: < %s | FileCheck %s --check-prefix=KERNEL-IR
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s \
+; RUN: -polly-acc-libdevice=%S/libdevice-functions-copied-into-kernel_libdevice.bc \
+; RUN: | FileCheck %s --check-prefix=HOST-IR
+
+; Test that we recognise and code-generate a kernel that calls functions
+; which can be mapped to NVIDIA's libdevice.
+
+; REQUIRES: pollyacc
+
+; Check that we model the kernel as a scop.
+; SCOP: Function: f
+; SCOP-NEXT: Region: %entry.split---%for.end
+
+; Check that the libdevice call is present in the kernel IR.
+; KERNEL-IR: %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_)
+
+; Check that a kernel launch is generated in the host IR.
+; The declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+
+; void f(float *A, float *B, int N) {
+;   for(int i = 0; i < N; i++) {
+;     float tmp0 = A[i];
+;     float tmp1 = expf(tmp0);
+;     B[i] = tmp1;
+;   }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* %A, float* %B, i32 %N) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %A.arr.i.val = load float, float* %A.arr.i, align 4
+  ; Call to a function that should be mapped to libdevice in the kernel.
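+  ; GPUNodeBuilder::addLibDevice() renames this declaration to @__nv_expf in
+  ; the kernel module; its definition is then pulled in from the libdevice
+  ; module via Linker::LinkOnlyNeeded.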
+  %expf = tail call float @expf(float %A.arr.i.val)
+  %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  store float %expf, float* %B.arr.i, align 4
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %wide.trip.count = zext i32 %N to i64
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @expf(float) #0
+
+attributes #0 = { nounwind readnone }
+
Index: test/GPGPU/libdevice-functions-copied-into-kernel_libdevice.bc
===================================================================
--- /dev/null
+++ test/GPGPU/libdevice-functions-copied-into-kernel_libdevice.bc
@@ -0,0 +1,6 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define float @__nv_expf(float %a) {
+  ret float %a
+}
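Two remarks on the stub above. First, it is textual IR even though the file
name ends in .bc; parseIRFile dispatches on the file contents (bitcode magic
versus textual IR), so the test needs no real bitcode file. Second, after
addLibDevice() has run, no mapped declaration should remain unresolved; a
sanity check along these lines could verify that (a sketch with an invented
helper name, not part of the patch):

    #include "llvm/IR/Module.h"

    // True if a libdevice call survived linking unresolved, i.e. some
    // __nv_-prefixed function is still only a declaration.
    static bool hasUnresolvedLibDeviceCalls(const llvm::Module &M) {
      for (const llvm::Function &F : M.functions())
        if (F.isDeclaration() && F.getName().startswith("__nv_"))
          return true;
      return false;
    }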