Skip to content

Commit 56572c6

Browse files
committedAug 31, 2017
[PPCGCodeGen] Convert intrinsics to libdevice functions whenever possible.
This is useful when we face certain intrinsics such as `llvm.exp.*` which cannot be lowered by the NVPTX backend while other intrinsics can. Otherwise, we would need to keep blacklists of intrinsics that cannot be handled by the NVPTX backend. It is much simpler to try and promote all intrinsics to libdevice versions. This patch makes the handling of functions and intrinsics very uniform, and will always try to use a libdevice version if it exists. Differential Revision: https://reviews.llvm.org/D37056 llvm-svn: 312239
1 parent 80df642 commit 56572c6

File tree

3 files changed

+55
-13
lines changed

3 files changed

+55
-13
lines changed
 

‎polly/lib/CodeGen/PPCGCodeGeneration.cpp

+41-7
Original file line numberDiff line numberDiff line change
@@ -1383,15 +1383,36 @@ isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
13831383

13841384
/// A list of functions that are available in NVIDIA's libdevice.
13851385
const std::set<std::string> CUDALibDeviceFunctions = {
1386-
"exp", "expf", "expl", "cos", "cosf", "sqrt",
1387-
"sqrtf", "copysign", "copysignf", "copysignl", "log", "logf"};
1386+
"exp", "expf", "expl", "cos", "cosf", "sqrt", "sqrtf",
1387+
"copysign", "copysignf", "copysignl", "log", "logf", "powi", "powif"};
1388+
1389+
// A map from intrinsics to their corresponding libdevice functions.
1390+
const std::map<std::string, std::string> IntrinsicToLibdeviceFunc = {
1391+
{"llvm.exp.f64", "exp"},
1392+
{"llvm.exp.f32", "expf"},
1393+
{"llvm.powi.f64", "powi"},
1394+
{"llvm.powi.f32", "powif"}};
13881395

13891396
/// Return the corresponding CUDA libdevice function name for @p F.
1397+
/// Note that this function will try to convert intrinsics in the list
1398+
/// IntrinsicToLibdeviceFunc into libdevice functions.
1399+
/// This is because some intrinsics such as `exp`
1400+
/// are not supported by the NVPTX backend.
1401+
/// If this restriction of the backend is lifted, we should refactor our code
1402+
/// so that we use intrinsics whenever possible.
13901403
///
13911404
/// Return "" if we are not compiling for CUDA.
13921405
std::string getCUDALibDeviceFuntion(Function *F) {
1393-
if (CUDALibDeviceFunctions.count(F->getName()))
1394-
return std::string("__nv_") + std::string(F->getName());
1406+
const std::string FnName = [&] {
1407+
auto It = IntrinsicToLibdeviceFunc.find(F->getName());
1408+
if (It != IntrinsicToLibdeviceFunc.end())
1409+
return It->second;
1410+
1411+
return std::string(F->getName());
1412+
}();
1413+
1414+
if (CUDALibDeviceFunctions.count(FnName))
1415+
return "__nv_" + FnName;
13951416

13961417
return "";
13971418
}
@@ -1409,7 +1430,7 @@ static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) {
14091430

14101431
return F->isIntrinsic() &&
14111432
(Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
1412-
Name.startswith("llvm.copysign") || Name.startswith("llvm.powi"));
1433+
Name.startswith("llvm.copysign"));
14131434
}
14141435

14151436
/// Do not take `Function` as a subtree value.
@@ -2362,9 +2383,22 @@ bool GPUNodeBuilder::requiresCUDALibDevice() {
23622383
if (!F.isDeclaration())
23632384
continue;
23642385

2365-
std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F);
2386+
const std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F);
23662387
if (CUDALibDeviceFunc.length() != 0) {
2367-
F.setName(CUDALibDeviceFunc);
2388+
// We need to handle the case where a module looks like this:
2389+
// @expf(..)
2390+
// @llvm.exp.f64(..)
2391+
// Both of these functions would be renamed to `__nv_expf`.
2392+
//
2393+
// So, we must first check for the existence of the libdevice function.
2394+
// If this exists, we replace our current function with it.
2395+
//
2396+
// If it does not exist, we rename the current function to the
2397+
// libdevice function name.
2398+
if (Function *Replacement = F.getParent()->getFunction(CUDALibDeviceFunc))
2399+
F.replaceAllUsesWith(Replacement);
2400+
else
2401+
F.setName(CUDALibDeviceFunc);
23682402
RequiresLibDevice = true;
23692403
}
23702404
}

‎polly/test/GPGPU/intrinsic-copied-into-kernel.ll

+2-5
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
1515
; KERNEL-IR: declare float @llvm.sqrt.f32(float)
1616
; KERNEL-IR: declare float @llvm.fabs.f32(float)
17-
; KERNEL-IR: declare float @llvm.powi.f32(float, i32)
17+
1818

1919
; Check that kernel launch is generated in host IR.
2020
; the declare would not be generated unless a call to a kernel exists.
@@ -27,7 +27,6 @@
2727
; float tmp1 = sqrt(tmp1);
2828
; float tmp2 = fabs(tmp2);
2929
; float tmp3 = copysignf(tmp1, tmp2);
30-
; float tmp4 = powi(tmp3, 2);
3130
; B[i] = tmp3;
3231
; }
3332
; }
@@ -53,9 +52,8 @@ for.body: ; preds = %for.body.lr.ph, %fo
5352
%sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
5453
%fabs = tail call float @llvm.fabs.f32(float %sqrt);
5554
%copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
56-
%powi = tail call float @llvm.powi.f32(float %copysign, i32 2);
5755
%B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
58-
store float %powi, float* %B.arr.i, align 4
56+
store float %copysign, float* %B.arr.i, align 4
5957

6058
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
6159
%wide.trip.count = zext i32 %N to i64
@@ -73,7 +71,6 @@ for.end: ; preds = %for.cond.for.end_cr
7371
declare float @llvm.sqrt.f32(float) #0
7472
declare float @llvm.fabs.f32(float) #0
7573
declare float @llvm.copysign.f32(float, float) #0
76-
declare float @llvm.powi.f32(float, i32) #0
7774

7875
attributes #0 = { nounwind readnone }
7976

‎polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll

+12-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf)
2323
; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf)
2424

25+
; Powi and exp cannot be lowered directly. Rather, we expect them to be
26+
; lowered by libdevice.
27+
; KERNEL-IR: %p_powi = tail call float @__nv_powif(float %p_logf, i32 2)
28+
; KERNEL-IR: %p_exp = tail call float @__nv_expf(float %p_powi)
29+
2530
; Check that kernel launch is generated in host IR.
2631
; the declare would not be generated unless a call to a kernel exists.
2732
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
@@ -33,6 +38,8 @@
3338
; float expf = expf(tmp1);
3439
; cosf = cosf(expf);
3540
; logf = logf(cosf);
41+
; powi = powi(logf, 2);
42+
; exp = exp(powi);
3643
; B[i] = exp;
3744
; }
3845
; }
@@ -58,8 +65,10 @@ for.body: ; preds = %for.body.lr.ph, %fo
5865
%expf = tail call float @expf(float %A.arr.i.val)
5966
%cosf = tail call float @cosf(float %expf)
6067
%logf = tail call float @logf(float %cosf)
68+
%powi = tail call float @llvm.powi.f32(float %logf, i32 2)
69+
%exp = tail call float @llvm.exp.f32(float %powi)
6170
%B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
62-
store float %logf, float* %B.arr.i, align 4
71+
store float %exp, float* %B.arr.i, align 4
6372

6473
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
6574
%wide.trip.count = zext i32 %N to i64
@@ -77,6 +86,8 @@ for.end: ; preds = %for.cond.for.end_cr
7786
declare float @expf(float) #0
7887
declare float @cosf(float) #0
7988
declare float @logf(float) #0
89+
declare float @llvm.powi.f32(float, i32) #0
90+
declare float @llvm.exp.f32(float) #0
8091

8192
attributes #0 = { nounwind readnone }
8293

0 commit comments

Comments
 (0)
Please sign in to comment.