Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -1383,15 +1383,47 @@ /// A list of functions that are available in NVIDIA's libdevice. const std::set<std::string> CUDALibDeviceFunctions = { - "exp", "expf", "expl", "cos", "cosf", "sqrt", - "sqrtf", "copysign", "copysignf", "copysignl", "log", "logf"}; + "exp", "expf", "expl", "cos", "cosf", "sqrt", "sqrtf", + "copysign", "copysignf", "copysignl", "log", "logf", "powi"}; + +/// A list of intrinsics that are unsupported by the NVPTX backend. +const std::set<std::string> NVPTXUnsupportedIntrinsics = {"exp", "powi"}; + +/// Return <name> from the full "llvm.<name>.<type>" name. +/// +/// Return "" if function is not an intrinsic. +std::string getStrippedIntrinsicName(const Function *F) { + assert(F && "invalid function pointer"); + const StringRef FnName = F->getName(); + + if (F->isIntrinsic() && FnName.startswith("llvm.")) { + const size_t BeginSeparator = FnName.find("."); + const size_t EndSeparator = FnName.rfind("."); + return std::string(FnName.slice(BeginSeparator + 1, EndSeparator)); + } + return ""; +} /// Return the corresponding CUDA libdevice function name for @p F. +/// Note that this function will try to convert intrinsics in the list +/// NVPTXUnsupportedIntrinsics into libdevice functions. +/// This is because some intrinsics such as `exp` +/// are not supported by the NVPTX backend. +/// If this restriction of the backend is lifted, we should refactor our code +/// so that we use intrinsics whenever possible. /// /// Return "" if we are not compiling for CUDA. 
std::string getCUDALibDeviceFuntion(Function *F) { - if (CUDALibDeviceFunctions.count(F->getName())) - return std::string("__nv_") + std::string(F->getName()); + const std::string FnName = [&] { + const std::string IntrinsicName = getStrippedIntrinsicName(F); + if (NVPTXUnsupportedIntrinsics.count(IntrinsicName)) + return IntrinsicName; + + return std::string(F->getName()); + }(); + + if (CUDALibDeviceFunctions.count(FnName)) + return "__nv_" + FnName; return ""; } @@ -1409,7 +1441,7 @@ return F->isIntrinsic() && (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") || - Name.startswith("llvm.copysign") || Name.startswith("llvm.powi")); + Name.startswith("llvm.copysign")); } /// Do not take `Function` as a subtree value. Index: test/GPGPU/intrinsic-copied-into-kernel.ll =================================================================== --- test/GPGPU/intrinsic-copied-into-kernel.ll +++ test/GPGPU/intrinsic-copied-into-kernel.ll @@ -14,7 +14,7 @@ ; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_) ; KERNEL-IR: declare float @llvm.sqrt.f32(float) ; KERNEL-IR: declare float @llvm.fabs.f32(float) -; KERNEL-IR: declare float @llvm.powi.f32(float, i32) + ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. 
@@ -27,7 +27,6 @@ ; float tmp1 = sqrt(tmp1); ; float tmp2 = fabs(tmp2); ; float tmp3 = copysignf(tmp1, tmp2); -; float tmp4 = powi(tmp3, 2); -; B[i] = tmp4; +; B[i] = tmp3; ; } ; } @@ -53,9 +52,8 @@ %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val) %fabs = tail call float @llvm.fabs.f32(float %sqrt); %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs); - %powi = tail call float @llvm.powi.f32(float %copysign, i32 2); %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv - store float %powi, float* %B.arr.i, align 4 + store float %copysign, float* %B.arr.i, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %wide.trip.count = zext i32 %N to i64 @@ -73,7 +71,6 @@ declare float @llvm.sqrt.f32(float) #0 declare float @llvm.fabs.f32(float) #0 declare float @llvm.copysign.f32(float, float) #0 -declare float @llvm.powi.f32(float, i32) #0 attributes #0 = { nounwind readnone } Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll =================================================================== --- test/GPGPU/libdevice-functions-copied-into-kernel.ll +++ test/GPGPU/libdevice-functions-copied-into-kernel.ll @@ -22,6 +22,11 @@ ; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf) ; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf) +; Powi and exp cannot be lowered directly. Rather, we expect them to be +; lowered by libdevice. +; KERNEL-IR: %p_powi = tail call float @__nv_powi(float %p_logf, i32 2) +; KERNEL-IR: %p_exp = tail call float @__nv_exp(float %p_powi) + ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. 
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) @@ -33,6 +38,8 @@ ; float expf = expf(tmp1); ; cosf = cosf(expf); ; logf = logf(cosf); +; powi = powi(logf, 2); +; exp = exp(powi); -; B[i] = logf; +; B[i] = exp; ; } ; } @@ -58,8 +65,10 @@ %expf = tail call float @expf(float %A.arr.i.val) %cosf = tail call float @cosf(float %expf) %logf = tail call float @logf(float %cosf) + %powi = tail call float @llvm.powi.f32(float %logf, i32 2) + %exp = tail call float @llvm.exp.f32(float %powi) %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv - store float %logf, float* %B.arr.i, align 4 + store float %exp, float* %B.arr.i, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %wide.trip.count = zext i32 %N to i64 @@ -77,6 +86,8 @@ declare float @expf(float) #0 declare float @cosf(float) #0 declare float @logf(float) #0 +declare float @llvm.powi.f32(float, i32) #0 +declare float @llvm.exp.f32(float) #0 attributes #0 = { nounwind readnone }