Skip to content

Commit 56572c6

Browse files
committedAug 31, 2017
[PPCGCodeGen] Convert intrinsics to libdevice functions whenever possible.
This is useful when we face certain intrinsics such as `llvm.exp.*` which cannot be lowered by the NVPTX backend while other intrinsics can. Otherwise, we would need to keep blacklists of intrinsics that cannot be handled by the NVPTX backend. It is much simpler to try and promote all intrinsics to libdevice versions. This patch makes the handling of functions and intrinsics very uniform, and will always try to use a libdevice version if it exists. Differential Revision: https://reviews.llvm.org/D37056 llvm-svn: 312239
1 parent 80df642 commit 56572c6

File tree

3 files changed

+55
-13
lines changed

3 files changed

+55
-13
lines changed
 

‎polly/lib/CodeGen/PPCGCodeGeneration.cpp

+41-7
Original file line numberDiff line numberDiff line change
@@ -1383,15 +1383,36 @@ isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
13831383

13841384
/// A list of functions that are available in NVIDIA's libdevice.
13851385
const std::set<std::string> CUDALibDeviceFunctions = {
1386-
"exp", "expf", "expl", "cos", "cosf", "sqrt",
1387-
"sqrtf", "copysign", "copysignf", "copysignl", "log", "logf"};
1386+
"exp", "expf", "expl", "cos", "cosf", "sqrt", "sqrtf",
1387+
"copysign", "copysignf", "copysignl", "log", "logf", "powi", "powif"};
1388+
1389+
// A map from intrinsics to their corresponding libdevice functions.
1390+
const std::map<std::string, std::string> IntrinsicToLibdeviceFunc = {
1391+
{"llvm.exp.f64", "exp"},
1392+
{"llvm.exp.f32", "expf"},
1393+
{"llvm.powi.f64", "powi"},
1394+
{"llvm.powi.f32", "powif"}};
13881395

13891396
/// Return the corresponding CUDA libdevice function name for @p F.
1397+
/// Note that this function will try to convert intrinsics in the list
1398+
/// IntrinsicToLibdeviceFunc into libdevice functions.
1399+
/// This is because some intrinsics such as `exp`
1400+
/// are not supported by the NVPTX backend.
1401+
/// If this restriction of the backend is lifted, we should refactor our code
1402+
/// so that we use intrinsics whenever possible.
13901403
///
13911404
/// Return "" if we are not compiling for CUDA.
13921405
std::string getCUDALibDeviceFuntion(Function *F) {
1393-
if (CUDALibDeviceFunctions.count(F->getName()))
1394-
return std::string("__nv_") + std::string(F->getName());
1406+
const std::string FnName = [&] {
1407+
auto It = IntrinsicToLibdeviceFunc.find(F->getName());
1408+
if (It != IntrinsicToLibdeviceFunc.end())
1409+
return It->second;
1410+
1411+
return std::string(F->getName());
1412+
}();
1413+
1414+
if (CUDALibDeviceFunctions.count(FnName))
1415+
return "__nv_" + FnName;
13951416

13961417
return "";
13971418
}
@@ -1409,7 +1430,7 @@ static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) {
14091430

14101431
return F->isIntrinsic() &&
14111432
(Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
1412-
Name.startswith("llvm.copysign") || Name.startswith("llvm.powi"));
1433+
Name.startswith("llvm.copysign"));
14131434
}
14141435

14151436
/// Do not take `Function` as a subtree value.
@@ -2362,9 +2383,22 @@ bool GPUNodeBuilder::requiresCUDALibDevice() {
23622383
if (!F.isDeclaration())
23632384
continue;
23642385

2365-
std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F);
2386+
const std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F);
23662387
if (CUDALibDeviceFunc.length() != 0) {
2367-
F.setName(CUDALibDeviceFunc);
2388+
// We need to handle the case where a module looks like this:
2389+
// @expf(..)
2390+
// @llvm.exp.f64(..)
2391+
// Both of these functions would be renamed to `__nv_expf`.
2392+
//
2393+
// So, we must first check for the existence of the libdevice function.
2394+
// If this exists, we replace our current function with it.
2395+
//
2396+
// If it does not exist, we rename the current function to the
2397+
// libdevice function name.
2398+
if (Function *Replacement = F.getParent()->getFunction(CUDALibDeviceFunc))
2399+
F.replaceAllUsesWith(Replacement);
2400+
else
2401+
F.setName(CUDALibDeviceFunc);
23682402
RequiresLibDevice = true;
23692403
}
23702404
}

‎polly/test/GPGPU/intrinsic-copied-into-kernel.ll

+2-5
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
1515
; KERNEL-IR: declare float @llvm.sqrt.f32(float)
1616
; KERNEL-IR: declare float @llvm.fabs.f32(float)
17-
; KERNEL-IR: declare float @llvm.powi.f32(float, i32)
17+
1818

1919
; Check that kernel launch is generated in host IR.
2020
; the declare would not be generated unless a call to a kernel exists.
@@ -27,7 +27,6 @@
2727
; float tmp1 = sqrt(tmp1);
2828
; float tmp2 = fabs(tmp2);
2929
; float tmp3 = copysignf(tmp1, tmp2);
30-
; float tmp4 = powi(tmp3, 2);
3130
; B[i] = tmp3;
3231
; }
3332
; }
@@ -53,9 +52,8 @@ for.body: ; preds = %for.body.lr.ph, %fo
5352
%sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
5453
%fabs = tail call float @llvm.fabs.f32(float %sqrt);
5554
%copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
56-
%powi = tail call float @llvm.powi.f32(float %copysign, i32 2);
5755
%B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
58-
store float %powi, float* %B.arr.i, align 4
56+
store float %copysign, float* %B.arr.i, align 4
5957

6058
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
6159
%wide.trip.count = zext i32 %N to i64
@@ -73,7 +71,6 @@ for.end: ; preds = %for.cond.for.end_cr
7371
declare float @llvm.sqrt.f32(float) #0
7472
declare float @llvm.fabs.f32(float) #0
7573
declare float @llvm.copysign.f32(float, float) #0
76-
declare float @llvm.powi.f32(float, i32) #0
7774

7875
attributes #0 = { nounwind readnone }
7976

‎polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll

+12-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf)
2323
; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf)
2424

25+
; Powi and exp cannot be lowered directly. Rather, we expect them to be
26+
; lowered by libdevice.
27+
; KERNEL-IR: %p_powi = tail call float @__nv_powif(float %p_logf, i32 2)
28+
; KERNEL-IR: %p_exp = tail call float @__nv_expf(float %p_powi)
29+
2530
; Check that kernel launch is generated in host IR.
2631
; the declare would not be generated unless a call to a kernel exists.
2732
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
@@ -33,6 +38,8 @@
3338
; float expf = expf(tmp1);
3439
; cosf = cosf(expf);
3540
; logf = logf(cosf);
41+
; powi = powi(logf, 2);
42+
; exp = exp(powi);
3643
; B[i] = exp;
3744
; }
3845
; }
@@ -58,8 +65,10 @@ for.body: ; preds = %for.body.lr.ph, %fo
5865
%expf = tail call float @expf(float %A.arr.i.val)
5966
%cosf = tail call float @cosf(float %expf)
6067
%logf = tail call float @logf(float %cosf)
68+
%powi = tail call float @llvm.powi.f32(float %logf, i32 2)
69+
%exp = tail call float @llvm.exp.f32(float %powi)
6170
%B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
62-
store float %logf, float* %B.arr.i, align 4
71+
store float %exp, float* %B.arr.i, align 4
6372

6473
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
6574
%wide.trip.count = zext i32 %N to i64
@@ -77,6 +86,8 @@ for.end: ; preds = %for.cond.for.end_cr
7786
declare float @expf(float) #0
7887
declare float @cosf(float) #0
7988
declare float @logf(float) #0
89+
declare float @llvm.powi.f32(float, i32) #0
90+
declare float @llvm.exp.f32(float) #0
8091

8192
attributes #0 = { nounwind readnone }
8293

0 commit comments

Comments
 (0)
Please sign in to comment.