Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -1383,15 +1383,47 @@ /// A list of functions that are available in NVIDIA's libdevice. const std::set<std::string> CUDALibDeviceFunctions = { - "exp", "expf", "expl", "cos", "cosf", "sqrt", - "sqrtf", "copysign", "copysignf", "copysignl", "log", "logf"}; + "exp", "expf", "expl", "cos", "cosf", "sqrt", "sqrtf", + "copysign", "copysignf", "copysignl", "log", "logf", "powi"}; + +/// A list of intrinsics that are unsupported by the NVPTX backend. +const std::set<std::string> NVPTXUnsupportedIntrinsics = {"exp", "powi"}; + +/// Return <name> from the full "llvm.<name>.<type>" name. +/// +/// Return "" if function is not an intrinsic. +std::string getStrippedIntrinsicName(const Function *F) { + assert(F && "invalid function pointer"); + const StringRef FnName = F->getName(); + + if (F->isIntrinsic() && FnName.startswith("llvm.")) { + const size_t BeginSeparator = FnName.find("."); + const size_t EndSeparator = FnName.rfind("."); + return std::string(FnName.slice(BeginSeparator + 1, EndSeparator)); + } + return ""; +} /// Return the corresponding CUDA libdevice function name for @p F. +/// Note that this function will try to convert intrinsics in the list +/// NVPTXUnsupportedIntrinsics into libdevice functions. +/// This is because some intrinsics such as `exp` +/// are not supported by the NVPTX backend. +/// If this restriction of the backend is lifted, we should refactor our code +/// so that we use intrinsics whenever possible. /// /// Return "" if we are not compiling for CUDA. 
std::string getCUDALibDeviceFuntion(Function *F) { - if (CUDALibDeviceFunctions.count(F->getName())) - return std::string("__nv_") + std::string(F->getName()); + const std::string FnName = [&] { + const std::string IntrinsicName = getStrippedIntrinsicName(F); + if (NVPTXUnsupportedIntrinsics.count(IntrinsicName)) + return IntrinsicName; + + return std::string(F->getName()); + }(); + + if (CUDALibDeviceFunctions.count(FnName)) + return "__nv_" + FnName; return ""; } @@ -1409,7 +1441,7 @@ return F->isIntrinsic() && (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") || - Name.startswith("llvm.copysign") || Name.startswith("llvm.powi")); + Name.startswith("llvm.copysign")); } /// Do not take `Function` as a subtree value. Index: test/GPGPU/intrinsic-copied-into-kernel.ll =================================================================== --- test/GPGPU/intrinsic-copied-into-kernel.ll +++ test/GPGPU/intrinsic-copied-into-kernel.ll @@ -14,7 +14,7 @@ ; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_) ; KERNEL-IR: declare float @llvm.sqrt.f32(float) ; KERNEL-IR: declare float @llvm.fabs.f32(float) -; KERNEL-IR: declare float @llvm.powi.f32(float, i32) + ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. 
@@ -27,7 +27,6 @@ ; float tmp1 = sqrt(tmp1); ; float tmp2 = fabs(tmp2); ; float tmp3 = copysignf(tmp1, tmp2); -; float tmp4 = powi(tmp3, 2); -; B[i] = tmp4; +; B[i] = tmp3; ; } ; } @@ -53,9 +52,8 @@ %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val) %fabs = tail call float @llvm.fabs.f32(float %sqrt); %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs); - %powi = tail call float @llvm.powi.f32(float %copysign, i32 2); %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv - store float %powi, float* %B.arr.i, align 4 + store float %copysign, float* %B.arr.i, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %wide.trip.count = zext i32 %N to i64 @@ -73,7 +71,6 @@ declare float @llvm.sqrt.f32(float) #0 declare float @llvm.fabs.f32(float) #0 declare float @llvm.copysign.f32(float, float) #0 -declare float @llvm.powi.f32(float, i32) #0 attributes #0 = { nounwind readnone } Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll =================================================================== --- test/GPGPU/libdevice-functions-copied-into-kernel.ll +++ test/GPGPU/libdevice-functions-copied-into-kernel.ll @@ -22,6 +22,11 @@ ; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf) ; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf) +; Powi and exp cannot be lowered directly. Rather, we expect them to be +; lowered by libdevice. +; KERNEL-IR: %p_powi = tail call float @__nv_powi(float %p_logf, i32 2) +; KERNEL-IR: %p_exp = tail call float @__nv_exp(float %p_powi) + ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. 
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) @@ -33,6 +38,8 @@ ; float expf = expf(tmp1); ; cosf = cosf(expf); ; logf = logf(cosf); +; powi = powi(logf, 2); +; exp = exp(powi); -; B[i] = logf; +; B[i] = exp; ; } ; } @@ -58,8 +65,10 @@ %expf = tail call float @expf(float %A.arr.i.val) %cosf = tail call float @cosf(float %expf) %logf = tail call float @logf(float %cosf) + %powi = tail call float @llvm.powi.f32(float %logf, i32 2) + %exp = tail call float @llvm.exp.f32(float %powi) %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv - store float %logf, float* %B.arr.i, align 4 + store float %exp, float* %B.arr.i, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %wide.trip.count = zext i32 %N to i64 @@ -77,6 +86,8 @@ declare float @expf(float) #0 declare float @cosf(float) #0 declare float @logf(float) #0 +declare float @llvm.powi.f32(float, i32) #0 +declare float @llvm.exp.f32(float) #0 attributes #0 = { nounwind readnone }