Index: lib/CMakeLists.txt
===================================================================
--- lib/CMakeLists.txt
+++ lib/CMakeLists.txt
@@ -106,6 +106,8 @@
     LLVMipo
     LLVMMC
     LLVMPasses
+    LLVMLinker
+    LLVMIRReader
     ${nvptx_libs}
     # The libraries below are required for darwin: http://PR26392
     LLVMBitReader
Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -31,6 +31,8 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetMachine.h"
@@ -102,6 +104,11 @@
                cl::Hidden, cl::init(false), cl::ZeroOrMore,
                cl::cat(PollyCategory));
 
+static cl::opt<std::string> LibDevice(
+    "polly-acc-libdevice", cl::desc("Path to CUDA libdevice"), cl::Hidden,
+    cl::init("/usr/local/cuda-7.5/nvvm/libdevice/libdevice.compute_20.10.bc"),
+    cl::ZeroOrMore, cl::cat(PollyCategory));
+
 static cl::opt<std::string>
     CudaVersion("polly-acc-cuda-version",
                 cl::desc("The CUDA version to compile for"), cl::Hidden,
@@ -598,6 +605,9 @@
   /// @param F The function to remove references to.
   void clearLoops(Function *F);
 
+  /// Link with the NVIDIA libdevice library (if needed and available).
+  void addLibDevice();
+
   /// Finalize the generation of the kernel function.
   ///
   /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
@@ -1300,13 +1310,32 @@
   return isl_bool_true;
 }
 
+/// A list of functions that are available in NVIDIA's libdevice.
+std::vector<std::string> LibDeviceFunctions = {"exp", "expf", "expl", "cos",
+                                               "cosf"};
+
+/// Return the corresponding CUDA libdevice function name for @p F.
+///
+/// Return "" if @p F has no libdevice counterpart.
+std::string getLibDeviceFunction(Function *F) {
+  for (auto Name : LibDeviceFunctions)
+    if (Name == F->getName())
+      return "__nv_" + Name;
+
+  return "";
+}
+
 /// Check if F is a function that we can code-generate in a GPU kernel.
-static bool isValidFunctionInKernel(llvm::Function *F) {
+static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) {
   assert(F && "F is an invalid pointer");
   // We string compare against the name of the function to allow
   // all variants of the intrinsic "llvm.sqrt.*", "llvm.fabs", and
   // "llvm.copysign".
   const StringRef Name = F->getName();
+
+  if (AllowLibDevice && getLibDeviceFunction(F).length() > 0)
+    return true;
+
   return F->isIntrinsic() &&
          (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
           Name.startswith("llvm.copysign"));
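Usage sketch (not part of the patch): with the LibDevice option above, a
specific libdevice copy can be selected on the opt command line, mirroring
the RUN lines of the new test below. The path here is an assumption for a
CUDA 8.0 install; CUDA 9 and later ship a single unified libdevice.10.bc
instead of per-compute-capability files:

    opt -load LLVMPolly.so -polly-codegen-ppcg \
        -polly-acc-libdevice=/usr/local/cuda-8.0/nvvm/libdevice/libdevice.compute_35.10.bc \
        -S < input.ll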
@@ -1322,14 +1351,16 @@
 /// Return `Function`s from `RawSubtreeValues`.
 static SetVector<Function *>
-getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
+getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues,
+                                 bool AllowLibDevice) {
   SetVector<Function *> SubtreeFunctions;
   for (Value *It : RawSubtreeValues) {
     Function *F = dyn_cast<Function>(It);
     if (F) {
-      assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
-                                           "this point if an invalid function "
-                                           "were present in a kernel.");
+      assert(isValidFunctionInKernel(F, AllowLibDevice) &&
+             "Code should have bailed out by "
+             "this point if an invalid function "
+             "were present in a kernel.");
       SubtreeFunctions.insert(F);
     }
   }
@@ -1383,8 +1414,11 @@
       make_filter_range(SubtreeValues, isValidSubtreeValue);
   SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
                                         ValidSubtreeValuesIt.end());
+
+  bool AllowLibDevice = Arch == GPUArch::NVPTX64;
+
   SetVector<Function *> ValidSubtreeFunctions(
-      getFunctionsFromRawSubtreeValues(SubtreeValues));
+      getFunctionsFromRawSubtreeValues(SubtreeValues, AllowLibDevice));
 
   // @see IslNodeBuilder::getReferencesInSubtree
   SetVector<Value *> ReplacedValues;
@@ -2079,6 +2113,44 @@
   return ASMStream.str();
 }
 
+void GPUNodeBuilder::addLibDevice() {
+  if (Arch != GPUArch::NVPTX64)
+    return;
+
+  bool RequiresLibDevice = false;
+
+  for (Function &F : GPUModule->functions()) {
+    if (!F.isDeclaration())
+      continue;
+
+    std::string LibDeviceFunc = getLibDeviceFunction(&F);
+    if (LibDeviceFunc.length() != 0) {
+      F.setName(LibDeviceFunc);
+      RequiresLibDevice = true;
+    }
+  }
+
+  if (RequiresLibDevice) {
+    SMDiagnostic Error;
+    auto LibDeviceModule =
+        parseIRFile(LibDevice, Error, GPUModule->getContext());
+
+    if (!LibDeviceModule) {
+      BuildSuccessful = false;
+      errs() << "Could not find libdevice. Skipping GPU kernel generation. "
+                "Please set -polly-acc-libdevice accordingly.\n";
+      return;
+    }
+
+    Linker L(*GPUModule);
+
+    // Set an nvptx64 target triple to avoid linker warnings. The original
+    // triple of the libdevice files is nvptx-unknown-unknown.
+    LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+    L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded);
+  }
+}
+
 std::string GPUNodeBuilder::finalizeKernelFunction() {
 
   if (verifyModule(*GPUModule)) {
@@ -2092,6 +2164,8 @@
     return "";
   }
 
+  addLibDevice();
+
   if (DumpKernelIR)
     outs() << *GPUModule << "\n";
@@ -2943,10 +3017,12 @@
   ///
   /// If this basic block does something with a `Function` other than calling
   /// a function that we support in a kernel, return true.
-  bool containsInvalidKernelFunctionInBllock(const BasicBlock *BB) {
+  bool containsInvalidKernelFunctionInBllock(const BasicBlock *BB,
+                                             bool AllowLibDevice) {
     for (const Instruction &Inst : *BB) {
      const CallInst *Call = dyn_cast<CallInst>(&Inst);
-      if (Call && isValidFunctionInKernel(Call->getCalledFunction())) {
+      if (Call &&
+          isValidFunctionInKernel(Call->getCalledFunction(), AllowLibDevice)) {
        continue;
      }
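For reference, the link step in addLibDevice() above is the standard LLVM
recipe for pulling selected definitions out of another module. The same
pattern in isolation, as a minimal sketch (helper name and error handling
are mine, not Polly's):

    #include "llvm/IR/Module.h"
    #include "llvm/IRReader/IRReader.h"
    #include "llvm/Linker/Linker.h"
    #include "llvm/Support/SourceMgr.h"
    #include <memory>

    using namespace llvm;

    static bool linkLibDevice(Module &KernelM, StringRef Path) {
      SMDiagnostic Err;
      std::unique_ptr<Module> LibDeviceM =
          parseIRFile(Path, Err, KernelM.getContext());
      if (!LibDeviceM)
        return false; // Missing or unparsable file.

      // libdevice is built for nvptx-unknown-unknown; matching the kernel
      // module's triple silences the linker's triple-mismatch warning.
      LibDeviceM->setTargetTriple(KernelM.getTargetTriple());

      // LinkOnlyNeeded copies only definitions that resolve declarations the
      // kernel module references; linkModules() returns true on error.
      return !Linker::linkModules(KernelM, std::move(LibDeviceM),
                                  Linker::Flags::LinkOnlyNeeded);
    }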
@@ -2962,16 +3038,17 @@
   }
 
   /// Return whether the Scop S uses functions in a way that we do not support.
-  bool containsInvalidKernelFunction(const Scop &S) {
+  bool containsInvalidKernelFunction(const Scop &S, bool AllowLibDevice) {
     for (auto &Stmt : S) {
       if (Stmt.isBlockStmt()) {
-        if (containsInvalidKernelFunctionInBllock(Stmt.getBasicBlock()))
+        if (containsInvalidKernelFunctionInBllock(Stmt.getBasicBlock(),
+                                                  AllowLibDevice))
           return true;
       } else {
         assert(Stmt.isRegionStmt() &&
                "Stmt was neither block nor region statement");
         for (const BasicBlock *BB : Stmt.getRegion()->blocks())
-          if (containsInvalidKernelFunctionInBllock(BB))
+          if (containsInvalidKernelFunctionInBllock(BB, AllowLibDevice))
             return true;
       }
     }
@@ -3056,7 +3133,8 @@
     // kernel. This may lead to a kernel trying to call a function on the host.
     // This also allows us to prevent codegen from trying to take the
     // address of an intrinsic function to send to the kernel.
-    if (containsInvalidKernelFunction(CurrentScop)) {
+    if (containsInvalidKernelFunction(CurrentScop,
+                                      Architecture == GPUArch::NVPTX64)) {
       DEBUG(
           dbgs()
          << "Scop contains function which cannot be materialised in a GPU "
Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll
===================================================================
--- /dev/null
+++ test/GPGPU/libdevice-functions-copied-into-kernel.ll
@@ -0,0 +1,74 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s \
+; RUN: -polly-acc-libdevice=%S/libdevice-functions-copied-into-kernel_libdevice.bc \
+; RUN: | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
+; RUN: -polly-acc-libdevice=%S/libdevice-functions-copied-into-kernel_libdevice.bc \
+; RUN: < %s | FileCheck %s --check-prefix=KERNEL-IR
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s \
+; RUN: -polly-acc-libdevice=%S/libdevice-functions-copied-into-kernel_libdevice.bc \
+; RUN: | FileCheck %s --check-prefix=HOST-IR
+
+; Test that we recognise and code-generate a kernel that calls functions
+; which can be mapped to NVIDIA's libdevice.
+
+; REQUIRES: pollyacc
+
+; Check that we model the kernel as a scop.
+; SCOP: Function: f
+; SCOP-NEXT: Region: %entry.split---%for.end
+
+; Check that the libdevice call is present in the kernel IR.
+; KERNEL-IR: %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_)
+
+; Check that a kernel launch is generated in the host IR.
+; The declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+
+; void f(float *A, float *B, int N) {
+;   for(int i = 0; i < N; i++) {
+;     float tmp0 = A[i];
+;     float tmp1 = expf(tmp0);
+;     B[i] = tmp1;
+;   }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* %A, float* %B, i32 %N) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %A.arr.i.val = load float, float* %A.arr.i, align 4
+  ; Call to a function that should be mapped to libdevice in the kernel.
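+  ; GPUNodeBuilder::addLibDevice() renames this declaration to @__nv_expf in
+  ; the kernel module; its definition is then pulled in from the libdevice
+  ; module via Linker::LinkOnlyNeeded.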
+  %expf = tail call float @expf(float %A.arr.i.val)
+  %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  store float %expf, float* %B.arr.i, align 4
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %wide.trip.count = zext i32 %N to i64
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @expf(float) #0
+
+attributes #0 = { nounwind readnone }
+
Index: test/GPGPU/libdevice-functions-copied-into-kernel_libdevice.bc
===================================================================
--- /dev/null
+++ test/GPGPU/libdevice-functions-copied-into-kernel_libdevice.bc
@@ -0,0 +1,6 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define float @__nv_expf(float %a) {
+  ret float %a
+}
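Two remarks on the stub above. First, it is textual IR even though the file
name ends in .bc; parseIRFile dispatches on the file contents (bitcode magic
versus textual IR), so the test needs no real bitcode file. Second, after
addLibDevice() has run, no mapped declaration should remain unresolved; a
sanity check along these lines could verify that (a sketch with an invented
helper name, not part of the patch):

    #include "llvm/IR/Module.h"

    // True if a libdevice call survived linking unresolved, i.e. some
    // __nv_-prefixed function is still only a declaration.
    static bool hasUnresolvedLibDeviceCalls(const llvm::Module &M) {
      for (const llvm::Function &F : M.functions())
        if (F.isDeclaration() && F.getName().startswith("__nv_"))
          return true;
      return false;
    }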