Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -1383,15 +1383,36 @@ /// A list of functions that are available in NVIDIA's libdevice. const std::set<std::string> CUDALibDeviceFunctions = { - "exp", "expf", "expl", "cos", "cosf", "sqrt", - "sqrtf", "copysign", "copysignf", "copysignl", "log", "logf"}; + "exp", "expf", "expl", "cos", "cosf", "sqrt", "sqrtf", + "copysign", "copysignf", "copysignl", "log", "logf", "powi", "powif"}; + +// A map from intrinsics to their corresponding libdevice functions. +const std::map<std::string, std::string> IntrinsicToLibdeviceFunc = { + {"llvm.exp.f64", "exp"}, + {"llvm.exp.f32", "expf"}, + {"llvm.powi.f64", "powi"}, + {"llvm.powi.f32", "powif"}}; /// Return the corresponding CUDA libdevice function name for @p F. +/// Note that this function will try to convert intrinsics in the list +/// IntrinsicToLibdeviceFunc into libdevice functions. +/// This is because some intrinsics such as `exp` +/// are not supported by the NVPTX backend. +/// If this restriction of the backend is lifted, we should refactor our code +/// so that we use intrinsics whenever possible. /// /// Return "" if we are not compiling for CUDA. std::string getCUDALibDeviceFuntion(Function *F) { - if (CUDALibDeviceFunctions.count(F->getName())) - return std::string("__nv_") + std::string(F->getName()); + const std::string FnName = [&] { + auto It = IntrinsicToLibdeviceFunc.find(F->getName()); + if (It != IntrinsicToLibdeviceFunc.end()) + return It->second; + + return std::string(F->getName()); + }(); + + if (CUDALibDeviceFunctions.count(FnName)) + return "__nv_" + FnName; return ""; } @@ -1409,7 +1430,7 @@ return F->isIntrinsic() && (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") || - Name.startswith("llvm.copysign") || Name.startswith("llvm.powi")); + Name.startswith("llvm.copysign")); } /// Do not take `Function` as a subtree value. 
@@ -2362,9 +2383,22 @@ if (!F.isDeclaration()) continue; - std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F); + const std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F); if (CUDALibDeviceFunc.length() != 0) { - F.setName(CUDALibDeviceFunc); + // We need to handle the case where a module looks like this: + // @expf(..) + // @llvm.exp.f32(..) + // Both of these functions would be renamed to `__nv_expf`. + // + // So, we must first check for the existence of the libdevice function. + // If this exists, we replace our current function with it. + // + // If it does not exist, we rename the current function to the + // libdevice function name. + if (Function *Replacement = F.getParent()->getFunction(CUDALibDeviceFunc)) + F.replaceAllUsesWith(Replacement); + else + F.setName(CUDALibDeviceFunc); RequiresLibDevice = true; } } Index: test/GPGPU/intrinsic-copied-into-kernel.ll =================================================================== --- test/GPGPU/intrinsic-copied-into-kernel.ll +++ test/GPGPU/intrinsic-copied-into-kernel.ll @@ -14,7 +14,7 @@ ; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_) ; KERNEL-IR: declare float @llvm.sqrt.f32(float) ; KERNEL-IR: declare float @llvm.fabs.f32(float) -; KERNEL-IR: declare float @llvm.powi.f32(float, i32) + ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. 
@@ -27,7 +27,6 @@ ; float tmp1 = sqrt(tmp1); ; float tmp2 = fabs(tmp2); ; float tmp3 = copysignf(tmp1, tmp2); -; float tmp4 = powi(tmp3, 2); ; B[i] = tmp4; ; } ; } @@ -53,9 +52,8 @@ %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val) %fabs = tail call float @llvm.fabs.f32(float %sqrt); %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs); - %powi = tail call float @llvm.powi.f32(float %copysign, i32 2); %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv - store float %powi, float* %B.arr.i, align 4 + store float %copysign, float* %B.arr.i, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %wide.trip.count = zext i32 %N to i64 @@ -73,7 +71,6 @@ declare float @llvm.sqrt.f32(float) #0 declare float @llvm.fabs.f32(float) #0 declare float @llvm.copysign.f32(float, float) #0 -declare float @llvm.powi.f32(float, i32) #0 attributes #0 = { nounwind readnone } Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll =================================================================== --- test/GPGPU/libdevice-functions-copied-into-kernel.ll +++ test/GPGPU/libdevice-functions-copied-into-kernel.ll @@ -22,6 +22,11 @@ ; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf) ; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf) +; Powi and exp cannot be lowered directly. Rather, we expect them to be +; lowered by libdevice. +; KERNEL-IR: %p_powi = tail call float @__nv_powif(float %p_logf, i32 2) +; KERNEL-IR: %p_exp = tail call float @__nv_expf(float %p_powi) + ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. 
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) @@ -33,6 +38,8 @@ ; float expf = expf(tmp1); ; cosf = cosf(expf); ; logf = logf(cosf); +; powi = powi(logf, 2); +; exp = exp(powi); ; B[i] = logf; ; } ; } @@ -58,8 +65,10 @@ %expf = tail call float @expf(float %A.arr.i.val) %cosf = tail call float @cosf(float %expf) %logf = tail call float @logf(float %cosf) + %powi = tail call float @llvm.powi.f32(float %logf, i32 2) + %exp = tail call float @llvm.exp.f32(float %powi) %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv - store float %logf, float* %B.arr.i, align 4 + store float %exp, float* %B.arr.i, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %wide.trip.count = zext i32 %N to i64 @@ -77,6 +86,8 @@ declare float @expf(float) #0 declare float @cosf(float) #0 declare float @logf(float) #0 +declare float @llvm.powi.f32(float, i32) #0 +declare float @llvm.exp.f32(float) #0 attributes #0 = { nounwind readnone }