Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -152,10 +152,9 @@ option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF) if (POLLY_ENABLE_GPGPU_CODEGEN) - # Do not require CUDA/OpenCL, as GPU code generation test cases can be run - # without a CUDA/OpenCL library. + # Do not require CUDA, as GPU code generation test cases can be run without + # a cuda library. FIND_PACKAGE(CUDA) - FIND_PACKAGE(OpenCL) set(GPU_CODEGEN TRUE) else(POLLY_ENABLE_GPGPU_CODEGEN) set(GPU_CODEGEN FALSE) @@ -164,13 +163,8 @@ # Support GPGPU code generation if the library is available. if (CUDALIB_FOUND) - add_definitions(-DHAS_LIBCUDART) INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} ) endif(CUDALIB_FOUND) -if (OpenCL_FOUND) - add_definitions(-DHAS_LIBOPENCL) - INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} ) -endif(OpenCL_FOUND) option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON) if (NOT POLLY_BUNDLED_ISL) Index: include/polly/CodeGen/PPCGCodeGeneration.h =================================================================== --- include/polly/CodeGen/PPCGCodeGeneration.h +++ /dev/null @@ -1,24 +0,0 @@ -//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Take a scop created by ScopInfo and map it to GPU code using the ppcg -// GPU mapping strategy. -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PPCGCODEGENERATION_H -#define POLLY_PPCGCODEGENERATION_H - -/// The GPU Architecture to target. -enum GPUArch { NVPTX64 }; - -/// The GPU Runtime implementation to use. -enum GPURuntime { CUDA, OpenCL }; - -#endif // POLLY_PPCGCODEGENERATION_H Index: include/polly/LinkAllPasses.h =================================================================== --- include/polly/LinkAllPasses.h +++ include/polly/LinkAllPasses.h @@ -15,7 +15,6 @@ #ifndef POLLY_LINKALLPASSES_H #define POLLY_LINKALLPASSES_H -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/Config/config.h" #include "polly/PruneUnprofitable.h" #include "polly/Simplify.h" @@ -49,8 +48,7 @@ llvm::Pass *createIslAstInfoPass(); llvm::Pass *createCodeGenerationPass(); #ifdef GPU_CODEGEN -llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64, - GPURuntime Runtime = GPURuntime::CUDA); +llvm::Pass *createPPCGCodeGenerationPass(); #endif llvm::Pass *createIslScheduleOptimizerPass(); llvm::Pass *createFlattenSchedulePass(); Index: include/polly/ScopBuilder.h =================================================================== --- include/polly/ScopBuilder.h +++ include/polly/ScopBuilder.h @@ -54,6 +54,33 @@ // The Scop std::unique_ptr scop; + // Methods for pattern matching against Fortran code generated by dragonegg. + // @{ + + /// Try to pattern match and find the array descriptor structure in case of a + /// fortran array accesss. succeeds on load/store into a fortran array that + /// has been allocated. + /// + /// @see polly::FortranArrayDescriptor + /// + /// @param Inst The load/store instruction that access the memory. + /// + /// @note assumes -polly-canonicalize has been run. 
+  GlobalValue *findFortranArrayDescriptorForAllocArrayAccess(MemAccInst Inst);
+
+  /// Try to pattern match and find the array descriptor structure in case of a
+  /// Fortran array access. Succeeds on a load/store into a Fortran array that
+  /// is declared globally and has not been allocated in the current module.
+  ///
+  /// @see polly::FortranArrayDescriptor
+  ///
+  /// @param Inst The load/store instruction that accesses the memory.
+  ///
+  /// @note Assumes -polly-canonicalize has been run.
+  GlobalValue *
+  findFortranArrayDescriptorForNonAllocArrayAccess(MemAccInst Inst);
+  // @}
+
   // Build the SCoP for Region @p R.
   void buildScop(Region &R, AssumptionCache &AC);
Index: include/polly/ScopInfo.h
===================================================================
--- include/polly/ScopInfo.h
+++ include/polly/ScopInfo.h
@@ -602,6 +602,13 @@
   /// Updated access relation read from JSCOP file.
   isl_map *NewAccessRelation;
+
+  /// Fortran arrays that are created using "Allocate" are stored in terms
+  /// of a descriptor struct. This maintains a raw pointer to the memory,
+  /// along with auxiliary fields with information such as dimensions.
+  /// We hold a reference to the descriptor corresponding to a MemoryAccess
+  /// into a Fortran array. FAD stands for "Fortran Array Descriptor".
+  AssertingVH<GlobalValue> FAD;
   // @}
   __isl_give isl_basic_map *createBasicAccessMap(ScopStmt *Statement);
@@ -1006,6 +1013,10 @@
   /// Get the reduction type of this access
   ReductionType getReductionType() const { return RedType; }
+  /// Set the array descriptor corresponding to the Array on which the
+  /// memory access is performed.
+  void setFortranArrayDescriptor(GlobalValue *FAD);
+
   /// Update the original access relation.
   ///
   /// We need to update the original access relation during scop construction,
Index: lib/Analysis/ScopBuilder.cpp
===================================================================
--- lib/Analysis/ScopBuilder.cpp
+++ lib/Analysis/ScopBuilder.cpp
@@ -113,6 +113,159 @@
   }
 }
+/// This is matching against code generated by dragonegg after simplifier
+/// passes have been run.
+///
+/// This is trying to match against "@globaldescriptor", the descriptor
+/// of the Fortran array that is being accessed at the load/store. This style
+/// of code is generated for arrays that have been allocated using "Allocate"
+/// in the same module.
+///
+/// Pattern Match:
+/// 1. %mallocmem = i8* @malloc(i64 40)
+///
+/// 5. store i8* %mallocmem, i8** getelementptr inbounds
+///      (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"*
+///       @globaldescriptor, i64 0, i32 0), align 32
+///
+/// 2. %typedmem = bitcast i8* %mallocmem to <memtype>*
+///
+/// 3 is optional because if you are writing to the 0th index, you don't
+///   need a GEP.
+/// 3. [%slot = getelementptr inbounds i8, i8* %typedmem, i64 <index>]
+///
+/// 4.1 store/load <memtype> <val>, <memtype>* %typedmem, align 8
+/// 4.2 store/load <memtype> <val>, <memtype>* %slot, align 8
+GlobalValue *
+ScopBuilder::findFortranArrayDescriptorForAllocArrayAccess(MemAccInst Inst) {
+  // match: 4.1 & 4.2 store/load
+  if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst))
+    return nullptr;
+
+  // match: 4
+  if (Inst.getAlignment() != 8)
+    return nullptr;
+
+  Value *Address = Inst.getPointerOperand();
+
+  const BitCastInst *Bitcast = nullptr;
+  // [match: 3]
+  if (auto *Slot = dyn_cast<GetElementPtrInst>(Address)) {
+    Value *TypedMem = Slot->getPointerOperand();
+    // match: 2
+    Bitcast = dyn_cast<BitCastInst>(TypedMem);
+  } else {
+    // match: 2
+    Bitcast = dyn_cast<BitCastInst>(Address);
+  }
+
+  if (!Bitcast)
+    return nullptr;
+
+  auto *MallocMem = Bitcast->getOperand(0);
+
+  // match: 1
+  auto *MallocCall = dyn_cast<CallInst>(MallocMem);
+  if (!MallocCall)
+    return nullptr;
+
+  Function *MallocFn = MallocCall->getCalledFunction();
+  if (!(MallocFn && MallocFn->hasName() && MallocFn->getName() == "malloc"))
+    return nullptr;
+
+  // Find all uses of the malloc'd memory.
+  // We are looking for a "store" into a struct with the type being the Fortran
+  // descriptor type.
+  for (auto user : MallocMem->users()) {
+
+    // match: 5
+    auto *MallocStore = dyn_cast<StoreInst>(user);
+    if (!MallocStore)
+      continue;
+
+    auto *DescriptorGEP =
+        dyn_cast<GEPOperator>(MallocStore->getPointerOperand());
+    if (!DescriptorGEP)
+      continue;
+
+    // match: 5
+    auto DescriptorType =
+        dyn_cast<StructType>(DescriptorGEP->getSourceElementType());
+    if (!(DescriptorType && DescriptorType->hasName()))
+      continue;
+
+    // Name does not match the expected name.
+    if (!DescriptorType->getName().startswith("struct.array"))
+      continue;
+
+    GlobalValue *Descriptor =
+        dyn_cast<GlobalValue>(DescriptorGEP->getPointerOperand());
+
+    if (!Descriptor)
+      continue;
+
+    return Descriptor;
+  }
+
+  return nullptr;
+}
+
+/// This is matching against code generated by dragonegg after simplifier
+/// passes have been run.
+///
+/// This is trying to match against "@globaldescriptor", the descriptor
+/// of the Fortran array that is being accessed at the load/store. This style
+/// of code is generated for arrays that have been declared global and
+/// are being accessed across modules.
+///
+/// Pattern Match:
+/// 1. %mem = load double*, double** bitcast (%"struct.array1_real(kind=8)"*
+///    @globaldescriptor to double**), align 32
+///
+/// 2 is optional because if you are writing to the 0th index, you don't
+///   need a GEP.
+/// 2. [%slot = getelementptr inbounds i8, i8* %mem, i64 <index>]
+///
+/// 3.1 store/load <memtype> <val>, <memtype>* %slot, align 8
+/// 3.2 store/load <memtype> <val>, <memtype>* %mem, align 8
+GlobalValue *
+ScopBuilder::findFortranArrayDescriptorForNonAllocArrayAccess(MemAccInst Inst) {
+  // match: 3
+  if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst))
+    return nullptr;
+
+  // match: 3
+  if (Inst.getAlignment() != 8)
+    return nullptr;
+
+  Value *Slot = Inst.getPointerOperand();
+
+  LoadInst *MemLoad = nullptr;
+  // [match: 2]
+  if (auto *SlotGEP = dyn_cast<GetElementPtrInst>(Slot)) {
+    // match: 1
+    MemLoad = dyn_cast<LoadInst>(SlotGEP->getPointerOperand());
+  } else {
+    // match: 1
+    MemLoad = dyn_cast<LoadInst>(Slot);
+  }
+
+  if (!MemLoad)
+    return nullptr;
+
+  auto *BitcastOperator =
+      dyn_cast<BitCastOperator>(MemLoad->getPointerOperand());
+  if (!BitcastOperator)
+    return nullptr;
+
+  GlobalValue *Descriptor =
+      dyn_cast<GlobalValue>(BitcastOperator->getOperand(0));
+  if (!Descriptor)
+    return nullptr;
+
+  return Descriptor;
+}
+
 bool ScopBuilder::buildAccessMultiDimFixed(MemAccInst Inst, ScopStmt *Stmt) {
   Value *Val = Inst.getValueOperand();
   Type *ElementType = Val->getType();
@@ -532,9 +685,17 @@
                                 Type *ElementType, bool IsAffine,
                                 ArrayRef<const SCEV *> Subscripts,
                                 ArrayRef<const SCEV *> Sizes,
                                 Value *AccessValue) {
   ArrayBasePointers.insert(BaseAddress);
-  addMemoryAccess(MemAccInst->getParent(), MemAccInst, AccType, BaseAddress,
-                  ElementType, IsAffine, AccessValue, Subscripts, Sizes,
-                  MemoryKind::Array);
+  auto *MemAccess = addMemoryAccess(
+      MemAccInst->getParent(), MemAccInst, AccType, BaseAddress, ElementType,
+      IsAffine, AccessValue, Subscripts, Sizes, MemoryKind::Array);
+
+  // TODO: change to loop of function pointers?
+  if (GlobalValue *FAD =
+          findFortranArrayDescriptorForAllocArrayAccess(MemAccInst))
+    MemAccess->setFortranArrayDescriptor(FAD);
+  else if (GlobalValue *FAD =
+               findFortranArrayDescriptorForNonAllocArrayAccess(MemAccInst))
+    MemAccess->setFortranArrayDescriptor(FAD);
 }
 
 void ScopBuilder::ensureValueWrite(Instruction *Inst) {
Index: lib/Analysis/ScopInfo.cpp
===================================================================
--- lib/Analysis/ScopInfo.cpp
+++ lib/Analysis/ScopInfo.cpp
@@ -974,7 +974,7 @@
       Sizes(Sizes.begin(), Sizes.end()), AccessInstruction(AccessInst),
       AccessValue(AccessValue), IsAffine(Affine),
       Subscripts(Subscripts.begin(), Subscripts.end()), AccessRelation(nullptr),
-      NewAccessRelation(nullptr) {
+      NewAccessRelation(nullptr), FAD(nullptr) {
   static const std::string TypeStrings[] = {"", "_Read", "_Write", "_MayWrite"};
   const std::string Access = TypeStrings[AccType] + utostr(Stmt->size());
@@ -986,7 +986,8 @@
                            __isl_take isl_map *AccRel)
     : Kind(MemoryKind::Array), AccType(AccType), RedType(RT_NONE),
       Statement(Stmt), InvalidDomain(nullptr), AccessInstruction(nullptr),
-      IsAffine(true), AccessRelation(nullptr), NewAccessRelation(AccRel) {
+      IsAffine(true), AccessRelation(nullptr), NewAccessRelation(AccRel),
+      FAD(nullptr) {
   auto *ArrayInfoId = isl_map_get_tuple_id(NewAccessRelation, isl_dim_out);
   auto *SAI = ScopArrayInfo::getFromId(ArrayInfoId);
   Sizes.push_back(nullptr);
@@ -1022,6 +1023,22 @@
   return OS;
 }
+void MemoryAccess::setFortranArrayDescriptor(GlobalValue *FAD) {
+  this->FAD = FAD;
+
+// TODO: write checks to make sure it looks _exactly_ like a Fortran array
+// descriptor.
+#ifndef NDEBUG
+  StructType *ty = dyn_cast<StructType>(FAD->getValueType());
+  assert(ty && "expected value of type Fortran array descriptor");
+  assert(ty->hasName() && ty->getName().startswith("struct.array") &&
+         "expected global to follow Fortran array descriptor type naming "
+         "convention");
+  assert(ty->getNumElements() == 4 &&
"expected layout to be like Fortran array descriptor type"); +#endif +} + void MemoryAccess::print(raw_ostream &OS) const { switch (AccType) { case READ: @@ -1034,7 +1051,14 @@ OS.indent(12) << "MayWriteAccess :=\t"; break; } + OS << "[Reduction Type: " << getReductionType() << "] "; + + if (FAD) { + OS << "[Fortran array descriptor: " << FAD->getName(); + OS << "] "; + }; + OS << "[Scalar: " << isScalarKind() << "]\n"; OS.indent(16) << getOriginalAccessRelationStr() << ";\n"; if (hasNewAccessRelation()) Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/CodeGen/IslAst.h" #include "polly/CodeGen/IslNodeBuilder.h" #include "polly/CodeGen/Utils.h" @@ -154,9 +153,9 @@ GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, Scop &S, BasicBlock *StartBlock, - gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) + gpu_prog *Prog) : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), - Prog(Prog), Runtime(Runtime), Arch(Arch) { + Prog(Prog) { getExprBuilder().setIDToSAI(&IDToSAI); } @@ -202,12 +201,6 @@ /// The GPU program we generate code for. gpu_prog *Prog; - /// The GPU Runtime implementation to use (OpenCL or CUDA). - GPURuntime Runtime; - - /// The GPU Architecture to target. - GPUArch Arch; - /// Class to free isl_ids. class IslIdDeleter { public: @@ -759,17 +752,7 @@ } Value *GPUNodeBuilder::createCallInitContext() { - const char *Name; - - switch (Runtime) { - case GPURuntime::CUDA: - Name = "polly_initContextCUDA"; - break; - case GPURuntime::OpenCL: - Name = "polly_initContextCL"; - break; - } - + const char *Name = "polly_initContext"; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *F = M->getFunction(Name); @@ -1045,15 +1028,7 @@ void GPUNodeBuilder::createKernelSync() { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - - Function *Sync; - - switch (Arch) { - case GPUArch::NVPTX64: - Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); - break; - } - + auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); Builder.CreateCall(Sync, {}); } @@ -1459,12 +1434,7 @@ auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, GPUModule.get()); - - switch (Arch) { - case GPUArch::NVPTX64: - FN->setCallingConv(CallingConv::PTX_Kernel); - break; - } + FN->setCallingConv(CallingConv::PTX_Kernel); auto Arg = FN->arg_begin(); for (long i = 0; i < Kernel->n_array; i++) { @@ -1525,19 +1495,12 @@ } void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { - Intrinsic::ID IntrinsicsBID[2]; - Intrinsic::ID IntrinsicsTID[3]; - - switch (Arch) { - case GPUArch::NVPTX64: - IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; - IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; + Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x, + Intrinsic::nvvm_read_ptx_sreg_ctaid_y}; - IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; - IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; - IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; - break; - } + Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x, + 
Intrinsic::nvvm_read_ptx_sreg_tid_y, + Intrinsic::nvvm_read_ptx_sreg_tid_z}; auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { std::string Name = isl_id_get_name(Id); @@ -1686,18 +1649,11 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, SetVector &SubtreeValues) { + std::string Identifier = "kernel_" + std::to_string(Kernel->id); GPUModule.reset(new Module(Identifier, Builder.getContext())); - - switch (Arch) { - case GPUArch::NVPTX64: - if (Runtime == GPURuntime::CUDA) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); - else if (Runtime == GPURuntime::OpenCL) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); - GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); - break; - } + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); + GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); @@ -1718,21 +1674,7 @@ } std::string GPUNodeBuilder::createKernelASM() { - llvm::Triple GPUTriple; - - switch (Arch) { - case GPUArch::NVPTX64: - switch (Runtime) { - case GPURuntime::CUDA: - GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda")); - break; - case GPURuntime::OpenCL: - GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl")); - break; - } - break; - } - + llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); std::string ErrMsg; auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); @@ -1743,17 +1685,9 @@ TargetOptions Options; Options.UnsafeFPMath = FastMath; - - std::string subtarget; - - switch (Arch) { - case GPUArch::NVPTX64: - subtarget = CudaVersion; - break; - } - - std::unique_ptr TargetM(GPUTarget->createTargetMachine( - GPUTriple.getTriple(), subtarget, "", Options, Optional())); + std::unique_ptr TargetM( + GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", + Options, Optional())); SmallString<0> ASMString; raw_svector_ostream ASMStream(ASMString); @@ -1805,10 +1739,6 @@ public: static char ID; - GPURuntime Runtime = GPURuntime::CUDA; - - GPUArch Architecture = GPUArch::NVPTX64; - /// The scop that is currently processed. 
Scop *S; @@ -2592,7 +2522,7 @@ executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, - StartBlock, Prog, Runtime, Architecture); + StartBlock, Prog); // TODO: Handle LICM auto SplitBlock = StartBlock->getSinglePredecessor(); @@ -2680,12 +2610,7 @@ char PPCGCodeGeneration::ID = 1; -Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) { - PPCGCodeGeneration *generator = new PPCGCodeGeneration(); - generator->Runtime = Runtime; - generator->Architecture = Arch; - return generator; -} +Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", "Polly - Apply PPCG translation to SCOP", false, false) Index: lib/Support/RegisterPasses.cpp =================================================================== --- lib/Support/RegisterPasses.cpp +++ lib/Support/RegisterPasses.cpp @@ -23,7 +23,6 @@ #include "polly/Canonicalization.h" #include "polly/CodeGen/CodeGeneration.h" #include "polly/CodeGen/CodegenCleanup.h" -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/DeLICM.h" #include "polly/DependenceInfo.h" #include "polly/FlattenSchedule.h" @@ -102,23 +101,6 @@ ), cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory)); -#ifdef GPU_CODEGEN -static cl::opt GPURuntimeChoice( - "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"), - cl::values(clEnumValN(GPURuntime::CUDA, "libcudart", - "use the CUDA Runtime API"), - clEnumValN(GPURuntime::OpenCL, "libopencl", - "use the OpenCL Runtime API")), - cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory)); - -static cl::opt - GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"), - cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64", - "target NVIDIA 64-bit architecture")), - cl::init(GPUArch::NVPTX64), cl::ZeroOrMore, - cl::cat(PollyCategory)); -#endif - VectorizerChoice polly::PollyVectorizerChoice; static cl::opt Vectorizer( "polly-vectorizer", cl::desc("Select the vectorization strategy"), @@ -327,8 +309,7 @@ if (Target == TARGET_GPU) { #ifdef GPU_CODEGEN - PM.add( - polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice)); + PM.add(polly::createPPCGCodeGenerationPass()); #endif } else { switch (CodeGeneration) { Index: test/FortranDetection/global-malloc-nonvectored.ll =================================================================== --- /dev/null +++ test/FortranDetection/global-malloc-nonvectored.ll @@ -0,0 +1,143 @@ +; RUN: opt -S -analyze -polly-process-unprofitable -polly-remarks-minimal \ +; RUN: -polly-canonicalize -polly-scops -polly-dependences \ +; RUN: -debug-only=polly-dependence -polly-canonicalize -polly-allow-nonaffine \ +; RUN: -polly-ignore-aliasing -polly-invariant-load-hoisting \ +; RUN: < %s| FileCheck %s +; +; MODULE src_soil +; USE data_parameters, ONLY : & +; wp, & ! KIND-type parameter for real variables +; iintegers ! 
KIND-type parameter for standard integer variables +; IMPLICIT NONE +; REAL (KIND = wp), ALLOCATABLE, PRIVATE :: & +; xdzs (:) +; CONTAINS +; SUBROUTINE terra1(n) +; INTEGER, intent(in) :: n +; INTEGER (KIND=iintegers) :: & +; j +; Allocate(xdzs(n)); +; DO j = 2, n +; xdzs(j) = xdzs(j) * xdzs(j) + xdzs(j - 1) +; END DO +; END SUBROUTINE terra1 +; END MODULE src_soil + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +module asm "\09.ident\09\22GCC: (GNU) 4.6.4 LLVM: 3.3.1\22" + +%"struct.array1_real(kind=8)" = type { i8*, i64, i64, [1 x %struct.descriptor_dimension] } +%struct.descriptor_dimension = type { i64, i64, i64 } + +@__src_soil_MOD_xdzs = unnamed_addr global %"struct.array1_real(kind=8)" zeroinitializer, align 32 +@.cst = private unnamed_addr constant [67 x i8] c"Integer overflow when calculating the amount of memory to allocate\00", align 64 +@.cst1 = private unnamed_addr constant [37 x i8] c"Allocation would exceed memory limit\00", align 64 +@.cst2 = private unnamed_addr constant [93 x i8] c"At line 23 of file /home/siddhart/cosmo-self-installation/cosmo-pompa/cosmo/src/src_soil.f90\00", align 64 +@.cst3 = private unnamed_addr constant [55 x i8] c"Attempting to allocate already allocated variable '%s'\00", align 64 +@.cst4 = private unnamed_addr constant [5 x i8] c"xdzs\00", align 8 + +; Function Attrs: nounwind uwtable +define void @__src_soil_MOD_terra1(i32* noalias nocapture %n) unnamed_addr #0 { +entry: + store i64 537, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 2), align 16, !tbaa !0 + store i64 1, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 3, i64 0, i32 1), align 8, !tbaa !0 + %0 = load i32, i32* %n, align 4, !tbaa !3 + %1 = sext i32 %0 to i64 + store i64 %1, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 3, i64 0, i32 2), align 8, !tbaa !0 + store i64 1, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 3, i64 0, i32 0), align 8, !tbaa !0 + %2 = icmp slt i32 %0, 0 + %3 = select i1 %2, i64 0, i64 %1 + %4 = icmp eq i64 %3, 0 + br i1 %4, label %"16", label %"8" + +"8": ; preds = %entry + %5 = sdiv i64 9223372036854775807, %1 + %6 = icmp slt i64 %5, 1 + %7 = icmp slt i32 %0, 1 + %8 = shl nsw i64 %3, 3 + %.2 = select i1 %7, i64 0, i64 %8 + br i1 %6, label %"15", label %"16" + +"15": ; preds = %"8" + + unreachable + +"16": ; preds = %"8", %entry + %.24 = phi i64 [ %.2, %"8" ], [ 0, %entry ] + %9 = load i8*, i8** getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 0), align 32, !tbaa !5 + %10 = icmp eq i8* %9, null + br i1 %10, label %"17", label %"20" + +"17": ; preds = %"16" + %11 = icmp ne i64 %.24, 0 + %12 = select i1 %11, i64 %.24, i64 1 + %13 = tail call noalias i8* @malloc(i64 %12) #2 ;<= 1. 
malloc + %14 = icmp eq i8* %13, null + br i1 %14, label %"18", label %"19" + +"18": ; preds = %"17" + unreachable + +"19": ; preds = %"17" + store i8* %13, i8** getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 0), align 32, !tbaa !5 + store i64 -1, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 1), align 8, !tbaa !0 + %15 = icmp sgt i32 %0, 1 + br i1 %15, label %"21.preheader", label %return + +"21.preheader": ; preds = %"19" + %16 = bitcast i8* %13 to double* ;<= 2. bitcast to double* + %17 = add i32 %0, 1 + br label %"21" + +"20": ; preds = %"16" + unreachable + +"21": ; preds = %"21", %"21.preheader" + %18 = phi double [ undef, %"21.preheader" ], [ %23, %"21" ] + %indvars.iv = phi i64 [ 2, %"21.preheader" ], [ %indvars.iv.next, %"21" ] + %19 = add nsw i64 %indvars.iv, -1 + %20 = getelementptr inbounds double, double* %16, i64 %19 ;<= 3. GEP + %21 = load double, double* %20, align 8, !tbaa !7 + %22 = fmul double %21, %21 + %23 = fadd double %22, %18 + store double %23, double* %20, align 8, !tbaa !7 ;<= 4. store + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %17 + br i1 %exitcond, label %return, label %"21" + +return: ; preds = %"21", %"19" + ret void +} + +; Function Attrs: noreturn +declare void @_gfortran_runtime_error(i8*, ...) #1 + +; Function Attrs: nounwind +declare noalias i8* @malloc(i64) #2 + +; Function Attrs: noreturn +declare void @_gfortran_os_error(i8*) #1 + +; Function Attrs: noreturn +declare void @_gfortran_runtime_error_at(i8*, i8*, ...) #1 + +attributes #0 = { nounwind uwtable } +attributes #1 = { noreturn } +attributes #2 = { nounwind } +attributes #3 = { noreturn nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"alias set 4: integer(kind=8)", !2} +!2 = distinct !{!2} +!3 = !{!4, !4, i64 0} +!4 = !{!"alias set 11: integer(kind=4)", !2} +!5 = !{!6, !6, i64 0} +!6 = !{!"alias set 3: void*", !2} +!7 = !{!8, !8, i64 0} +!8 = !{!"alias set 18: real(kind=8)", !2} + +; CHECK: ReadAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] +; CHECK: MustWriteAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] Index: test/FortranDetection/global-nonmalloc-nonvectored.ll =================================================================== --- /dev/null +++ test/FortranDetection/global-nonmalloc-nonvectored.ll @@ -0,0 +1,87 @@ +; RUN: opt -S -analyze -polly-process-unprofitable -polly-remarks-minimal \ +; RUN: -polly-canonicalize -polly-scops -polly-dependences \ +; RUN: -debug-only=polly-dependence -polly-canonicalize -polly-allow-nonaffine \ +; RUN: -polly-ignore-aliasing -polly-invariant-load-hoisting \ +; RUN: < %s| FileCheck %s +; +; MODULE src_soil +; USE data_parameters, ONLY : & +; wp, & ! KIND-type parameter for real variables +; iintegers ! 
KIND-type parameter for standard integer variables +; IMPLICIT NONE +; REAL (KIND = wp), ALLOCATABLE, PRIVATE :: & +; xdzs (:) +; CONTAINS +; +; SUBROUTINE terra1(n) +; INTEGER, intent(in) :: n +; +; INTEGER (KIND=iintegers) :: & +; j +; +; DO j = 22, n +; xdzs(j) = xdzs(j) * xdzs(j) + xdzs(j - 1) +; END DO +; END SUBROUTINE terra1 +; END MODULE src_soil + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +module asm "\09.ident\09\22GCC: (GNU) 4.6.4 LLVM: 3.3.1\22" + +%"struct.array1_real(kind=8)" = type { i8*, i64, i64, [1 x %struct.descriptor_dimension] } +%struct.descriptor_dimension = type { i64, i64, i64 } + +@__src_soil_MOD_xdzs = unnamed_addr global %"struct.array1_real(kind=8)" zeroinitializer, align 32 + +; Function Attrs: nounwind uwtable +define void @__src_soil_MOD_terra1(i32* noalias nocapture %n) unnamed_addr #0 { +entry: + %0 = load i32, i32* %n, align 4, !tbaa !0 + %1 = icmp sgt i32 %0, 21 + br i1 %1, label %"3.preheader", label %return + +"3.preheader": ; preds = %entry + %2 = load i64, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 1), align 8, !tbaa !3 + %3 = load i8*, i8** getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 0), align 32, !tbaa !5 + %4 = bitcast i8* %3 to double* + %5 = add i32 %0, 1 + br label %"3" + +"3": ; preds = %"3", %"3.preheader" + %indvars.iv = phi i64 [ 22, %"3.preheader" ], [ %indvars.iv.next, %"3" ] + %6 = add nsw i64 %indvars.iv, %2 + %7 = getelementptr inbounds double, double* %4, i64 %6 + %8 = load double, double* %7, align 8, !tbaa !7 + %9 = fmul double %8, %8 + %10 = add nsw i64 %indvars.iv, -1 + %11 = add nsw i64 %10, %2 + %12 = getelementptr inbounds double, double* %4, i64 %11 + %13 = load double, double* %12, align 8, !tbaa !7 + %14 = fadd double %9, %13 + store double %14, double* %7, align 8, !tbaa !7 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %5 + br i1 %exitcond, label %return, label %"3" + +return: ; preds = %"3", %entry + ret void +} + +attributes #0 = { nounwind uwtable } + +!0 = !{!1, !1, i64 0} +!1 = !{!"alias set 11: integer(kind=4)", !2} +!2 = distinct !{!2} +!3 = !{!4, !4, i64 0} +!4 = !{!"alias set 4: integer(kind=8)", !2} +!5 = !{!6, !6, i64 0} +!6 = !{!"alias set 3: void*", !2} +!7 = !{!8, !8, i64 0} +!8 = !{!"alias set 18: real(kind=8)", !2} + +; CHECK: ReadAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] +; CHECK: ReadAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] +; CHECK: MustWriteAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] Index: test/GPGPU/cuda-managed-memory-simple.ll =================================================================== --- test/GPGPU/cuda-managed-memory-simple.ll +++ test/GPGPU/cuda-managed-memory-simple.ll @@ -35,7 +35,7 @@ ; CHECK-NOT: polly_freeDeviceMemory ; CHECK-NOT: polly_allocateMemoryForDevice -; CHECK: %13 = call i8* @polly_initContextCUDA() +; CHECK: %13 = call i8* @polly_initContext() ; CHECK-NEXT: %14 = bitcast i32* %A to i8* ; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; CHECK-NEXT: 
store i8* %14, i8** %polly_launch_0_param_0 @@ -46,7 +46,7 @@ ; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8* ; CHECK-NEXT: store i8* %19, i8** %18 -; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) +; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) ; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) ; CHECK-NEXT: call void @polly_freeKernel(i8* %20) ; CHECK-NEXT: call void @polly_synchronizeDevice() Index: test/GPGPU/size-cast.ll =================================================================== --- test/GPGPU/size-cast.ll +++ test/GPGPU/size-cast.ll @@ -29,7 +29,7 @@ ; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1) ; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0); -; IR: call i8* @polly_initContextCUDA() +; IR: call i8* @polly_initContext() ; IR-NEXT: sext i32 %arg to i64 ; IR-NEXT: mul i64 ; IR-NEXT: @polly_allocateMemoryForDevice Index: tools/CMakeLists.txt =================================================================== --- tools/CMakeLists.txt +++ tools/CMakeLists.txt @@ -1,5 +1,5 @@ -if (CUDALIB_FOUND OR OpenCL_FOUND) +if (CUDALIB_FOUND) add_subdirectory(GPURuntime) -endif (CUDALIB_FOUND OR OpenCL_FOUND) +endif (CUDALIB_FOUND) set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) Index: tools/GPURuntime/GPUJIT.h =================================================================== --- tools/GPURuntime/GPUJIT.h +++ tools/GPURuntime/GPUJIT.h @@ -76,27 +76,12 @@ * */ -typedef enum PollyGPURuntimeT { - RUNTIME_NONE, - RUNTIME_CUDA, - RUNTIME_CL -} PollyGPURuntime; - typedef struct PollyGPUContextT PollyGPUContext; typedef struct PollyGPUFunctionT PollyGPUFunction; typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; -typedef struct OpenCLContextT OpenCLContext; -typedef struct OpenCLKernelT OpenCLKernel; -typedef struct OpenCLDevicePtrT OpenCLDevicePtr; - -typedef struct CUDAContextT CUDAContext; -typedef struct CUDAKernelT CUDAKernel; -typedef struct CUDADevicePtrT CUDADevicePtr; - -PollyGPUContext *polly_initContextCUDA(); -PollyGPUContext *polly_initContextCL(); -PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, +PollyGPUContext *polly_initContext(); +PollyGPUFunction *polly_getKernel(const char *PTXBuffer, const char *KernelName); void polly_freeKernel(PollyGPUFunction *Kernel); void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -12,20 +12,8 @@ /******************************************************************************/ #include "GPUJIT.h" - -#ifdef HAS_LIBCUDART #include #include -#endif /* HAS_LIBCUDART */ - -#ifdef HAS_LIBOPENCL -#ifdef __APPLE__ -#include -#else -#include -#endif -#endif /* HAS_LIBOPENCL */ - #include #include #include @@ -34,8 +22,6 @@ static int DebugMode; static int CacheMode; -static PollyGPURuntime Runtime = RUNTIME_NONE; - static void debug_print(const char *format, ...) 
{ if (!DebugMode) return; @@ -47,853 +33,18 @@ } #define dump_function() debug_print("-> %s\n", __func__) -#define KERNEL_CACHE_SIZE 10 - -static void err_runtime() { - fprintf(stderr, "Runtime not correctly initialized.\n"); - exit(-1); -} - +/* Define Polly's GPGPU data types. */ struct PollyGPUContextT { - void *Context; -}; - -struct PollyGPUFunctionT { - void *Kernel; -}; - -struct PollyGPUDevicePtrT { - void *DevicePtr; -}; - -/******************************************************************************/ -/* OpenCL */ -/******************************************************************************/ -#ifdef HAS_LIBOPENCL - -struct OpenCLContextT { - cl_context Context; - cl_command_queue CommandQueue; -}; - -struct OpenCLKernelT { - cl_kernel Kernel; - cl_program Program; - const char *BinaryString; -}; - -struct OpenCLDevicePtrT { - cl_mem MemObj; -}; - -/* Dynamic library handles for the OpenCL runtime library. */ -static void *HandleOpenCL; - -/* Type-defines of function pointer to OpenCL Runtime API. */ -typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, - cl_platform_id *Platforms, - cl_uint *NumPlatforms); -static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; - -typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform, - cl_device_type DeviceType, - cl_uint NumEntries, cl_device_id *Devices, - cl_uint *NumDevices); -static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; - -typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device, - cl_device_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; - -typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; - -typedef cl_context clCreateContextFcnTy( - const cl_context_properties *Properties, cl_uint NumDevices, - const cl_device_id *Devices, - void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo, - size_t CB, void *UserData), - void *UserData, cl_int *ErrcodeRet); -static clCreateContextFcnTy *clCreateContextFcnPtr; - -typedef cl_command_queue -clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device, - cl_command_queue_properties Properties, - cl_int *ErrcodeRet); -static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; - -typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags, - size_t Size, void *HostPtr, - cl_int *ErrcodeRet); -static clCreateBufferFcnTy *clCreateBufferFcnPtr; - -typedef cl_int -clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer, - cl_bool BlockingWrite, size_t Offset, size_t Size, - const void *Ptr, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; - -typedef cl_program clCreateProgramWithBinaryFcnTy( - cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, - const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, - cl_int *ErrcodeRet); -static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; - -typedef cl_int clBuildProgramFcnTy( - cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList, - const char *Options, - void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData), - void *UserData); -static clBuildProgramFcnTy *clBuildProgramFcnPtr; - -typedef cl_kernel clCreateKernelFcnTy(cl_program Program, - const char *KernelName, - 
cl_int *ErrcodeRet); -static clCreateKernelFcnTy *clCreateKernelFcnPtr; - -typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex, - size_t ArgSize, const void *ArgValue); -static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; - -typedef cl_int clEnqueueNDRangeKernelFcnTy( - cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim, - const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; - -typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue, - cl_mem Buffer, cl_bool BlockingRead, - size_t Offset, size_t Size, void *Ptr, - cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, - cl_event *Event); -static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr; - -typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue); -static clFlushFcnTy *clFlushFcnPtr; - -typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue); -static clFinishFcnTy *clFinishFcnPtr; - -typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel); -static clReleaseKernelFcnTy *clReleaseKernelFcnPtr; - -typedef cl_int clReleaseProgramFcnTy(cl_program Program); -static clReleaseProgramFcnTy *clReleaseProgramFcnPtr; - -typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject); -static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr; - -typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue); -static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr; - -typedef cl_int clReleaseContextFcnTy(cl_context Context); -static clReleaseContextFcnTy *clReleaseContextFcnPtr; - -static void *getAPIHandleCL(void *Handle, const char *FuncName) { - char *Err; - void *FuncPtr; - dlerror(); - FuncPtr = dlsym(Handle, FuncName); - if ((Err = dlerror()) != 0) { - fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err); - return 0; - } - return FuncPtr; -} - -static int initialDeviceAPILibrariesCL() { - HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); - if (!HandleOpenCL) { - fprintf(stderr, "Cannot open library: %s. \n", dlerror()); - return 0; - } - return 1; -} - -static int initialDeviceAPIsCL() { - if (initialDeviceAPILibrariesCL() == 0) - return 0; - - /* Get function pointer to OpenCL Runtime API. - * - * Note that compilers conforming to the ISO C standard are required to - * generate a warning if a conversion from a void * pointer to a function - * pointer is attempted as in the following statements. The warning - * of this kind of cast may not be emitted by clang and new versions of gcc - * as it is valid on POSIX 2008. 
- */ - clGetPlatformIDsFcnPtr = - (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs"); - - clGetDeviceIDsFcnPtr = - (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs"); - - clGetDeviceInfoFcnPtr = - (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo"); - - clGetKernelInfoFcnPtr = - (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo"); - - clCreateContextFcnPtr = - (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext"); - - clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( - HandleOpenCL, "clCreateCommandQueue"); - - clCreateBufferFcnPtr = - (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer"); - - clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueWriteBuffer"); - - clCreateProgramWithBinaryFcnPtr = - (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( - HandleOpenCL, "clCreateProgramWithBinary"); - - clBuildProgramFcnPtr = - (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram"); - - clCreateKernelFcnPtr = - (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel"); - - clSetKernelArgFcnPtr = - (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg"); - - clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueNDRangeKernel"); - - clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueReadBuffer"); - - clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush"); - - clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish"); - - clReleaseKernelFcnPtr = - (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel"); - - clReleaseProgramFcnPtr = - (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram"); - - clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL( - HandleOpenCL, "clReleaseMemObject"); - - clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( - HandleOpenCL, "clReleaseCommandQueue"); - - clReleaseContextFcnPtr = - (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext"); - - return 1; -} - -/* Context and Device. */ -static PollyGPUContext *GlobalContext = NULL; -static cl_device_id GlobalDeviceID = NULL; - -/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ -static void printOpenCLError(int Error); - -static void checkOpenCLError(int Ret, const char *format, ...) { - if (Ret == CL_SUCCESS) - return; - - printOpenCLError(Ret); - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - exit(-1); -} - -static PollyGPUContext *initContextCL() { - dump_function(); - - PollyGPUContext *Context; - - cl_platform_id PlatformID = NULL; - cl_device_id DeviceID = NULL; - cl_uint NumDevicesRet; - cl_int Ret; - - char DeviceRevision[256]; - char DeviceName[256]; - size_t DeviceRevisionRetSize, DeviceNameRetSize; - - static __thread PollyGPUContext *CurrentContext = NULL; - - if (CurrentContext) - return CurrentContext; - - /* Get API handles. */ - if (initialDeviceAPIsCL() == 0) { - fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); - exit(-1); - } - - /* Get number of devices that support OpenCL. 
*/ - static const int NumberOfPlatforms = 1; - Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL); - checkOpenCLError(Ret, "Failed to get platform IDs.\n"); - // TODO: Extend to CL_DEVICE_TYPE_ALL? - static const int NumberOfDevices = 1; - Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices, - &DeviceID, &NumDevicesRet); - checkOpenCLError(Ret, "Failed to get device IDs.\n"); - - GlobalDeviceID = DeviceID; - if (NumDevicesRet == 0) { - fprintf(stderr, "There is no device supporting OpenCL.\n"); - exit(-1); - } - - /* Get device revision. */ - Ret = - clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision), - DeviceRevision, &DeviceRevisionRetSize); - checkOpenCLError(Ret, "Failed to fetch device revision.\n"); - - /* Get device name. */ - Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName), - DeviceName, &DeviceNameRetSize); - checkOpenCLError(Ret, "Failed to fetch device name.\n"); - - debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); - - /* Create context on the device. */ - Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); - if (Context == 0) { - fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); - exit(-1); - } - Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext)); - if (Context->Context == 0) { - fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n"); - exit(-1); - } - ((OpenCLContext *)Context->Context)->Context = - clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret); - checkOpenCLError(Ret, "Failed to create context.\n"); - - static const int ExtraProperties = 0; - ((OpenCLContext *)Context->Context)->CommandQueue = - clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context, - DeviceID, ExtraProperties, &Ret); - checkOpenCLError(Ret, "Failed to create command queue.\n"); - - if (CacheMode) - CurrentContext = Context; - - GlobalContext = Context; - return Context; -} - -static void freeKernelCL(PollyGPUFunction *Kernel) { - dump_function(); - - if (CacheMode) - return; - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); - checkOpenCLError(Ret, "Failed to flush command queue.\n"); - Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); - checkOpenCLError(Ret, "Failed to finish command queue.\n"); - - if (((OpenCLKernel *)Kernel->Kernel)->Kernel) { - cl_int Ret = - clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel); - checkOpenCLError(Ret, "Failed to release kernel.\n"); - } - - if (((OpenCLKernel *)Kernel->Kernel)->Program) { - cl_int Ret = - clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program); - checkOpenCLError(Ret, "Failed to release program.\n"); - } - - if (Kernel->Kernel) - free((OpenCLKernel *)Kernel->Kernel); - - if (Kernel) - free(Kernel); -} - -static PollyGPUFunction *getKernelCL(const char *BinaryBuffer, - const char *KernelName) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; - static __thread int NextCacheItem = 0; - - for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { - // We exploit here the property that all Polly-ACC kernels are allocated - // as global constants, hence a pointer comparision is sufficient to - // 
determin equality. - if (KernelCache[i] && - ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString == - BinaryBuffer) { - debug_print(" -> using cached kernel\n"); - return KernelCache[i]; - } - } - - PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); - if (Function == 0) { - fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); - exit(-1); - } - Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel)); - if (Function->Kernel == 0) { - fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n"); - exit(-1); - } - - if (!GlobalDeviceID) { - fprintf(stderr, "GPGPU-code generation not initialized correctly.\n"); - exit(-1); - } - - cl_int Ret; - size_t BinarySize = strlen(BinaryBuffer); - ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID, - (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL, - &Ret); - checkOpenCLError(Ret, "Failed to create program from binary.\n"); - - Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1, - &GlobalDeviceID, NULL, NULL, NULL); - checkOpenCLError(Ret, "Failed to build program.\n"); - - ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr( - ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret); - checkOpenCLError(Ret, "Failed to create kernel.\n"); - - ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer; - - if (CacheMode) { - if (KernelCache[NextCacheItem]) - freeKernelCL(KernelCache[NextCacheItem]); - - KernelCache[NextCacheItem] = Function; - - NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; - } - - return Function; -} - -static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clEnqueueWriteBufferFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->CommandQueue, - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, - HostData, 0, NULL, NULL); - checkOpenCLError(Ret, "Copying data from host memory to device failed.\n"); -} - -static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clEnqueueReadBufferFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->CommandQueue, - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, - HostData, 0, NULL, NULL); - checkOpenCLError(Ret, "Copying results from device to host memory failed.\n"); -} - -static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockDimX, - unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { - dump_function(); - - cl_int Ret; - cl_uint NumArgs; - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel; - Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS, - sizeof(cl_uint), &NumArgs, NULL); - checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n"); - - // TODO: Pass the size of the kernel arguments in to launchKernelCL, along - // with the arguments themselves. This is a dirty workaround that can be - // broken. 
- for (cl_uint i = 0; i < NumArgs; i++) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = - clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1, - (void *)Parameters[i]); - checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); - } - } - } - if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) { - fprintf(stderr, "Failed to set Kernel argument.\n"); - printOpenCLError(Ret); - exit(-1); - } - } - - unsigned int GridDimZ = 1; - size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY, - BlockDimZ * GridDimZ}; - size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ}; - - static const int WorkDim = 3; - OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; - Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel, - WorkDim, NULL, GlobalWorkSize, - LocalWorkSize, 0, NULL, NULL); - checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n"); -} - -static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) { - dump_function(); - - OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; - cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj); - checkOpenCLError(Ret, "Failed to free device memory.\n"); - - free(DevPtr); - free(Allocation); -} - -static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); - if (DevData == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr)); - if (DevData->DevicePtr == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - - cl_int Ret; - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj = - clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context, - CL_MEM_READ_WRITE, MemSize, NULL, &Ret); - checkOpenCLError(Ret, - "Allocate memory for GPU device memory pointer failed.\n"); - - return DevData; -} - -static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) { - dump_function(); - - OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; - return (void *)DevPtr->MemObj; -} - -static void synchronizeDeviceCL() { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) != - CL_SUCCESS) { - fprintf(stderr, "Synchronizing device and host memory failed.\n"); - exit(-1); - } -} - -static void freeContextCL(PollyGPUContext *Context) { - dump_function(); - - cl_int Ret; - - GlobalContext = NULL; - - OpenCLContext *Ctx = (OpenCLContext *)Context->Context; - if (Ctx->CommandQueue) { - Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue); - checkOpenCLError(Ret, "Could not release command queue.\n"); - } - - if (Ctx->Context) { - Ret = clReleaseContextFcnPtr(Ctx->Context); - checkOpenCLError(Ret, "Could not release context.\n"); - } - - free(Ctx); - free(Context); -} - -static void printOpenCLError(int Error) { - - switch (Error) { - case CL_SUCCESS: - // 
Success, don't print an error.
-    break;
-
-  // JIT/Runtime errors.
-  case CL_DEVICE_NOT_FOUND:
-    fprintf(stderr, "Device not found.\n");
-    break;
-  case CL_DEVICE_NOT_AVAILABLE:
-    fprintf(stderr, "Device not available.\n");
-    break;
-  case CL_COMPILER_NOT_AVAILABLE:
-    fprintf(stderr, "Compiler not available.\n");
-    break;
-  case CL_MEM_OBJECT_ALLOCATION_FAILURE:
-    fprintf(stderr, "Mem object allocation failure.\n");
-    break;
-  case CL_OUT_OF_RESOURCES:
-    fprintf(stderr, "Out of resources.\n");
-    break;
-  case CL_OUT_OF_HOST_MEMORY:
-    fprintf(stderr, "Out of host memory.\n");
-    break;
-  case CL_PROFILING_INFO_NOT_AVAILABLE:
-    fprintf(stderr, "Profiling info not available.\n");
-    break;
-  case CL_MEM_COPY_OVERLAP:
-    fprintf(stderr, "Mem copy overlap.\n");
-    break;
-  case CL_IMAGE_FORMAT_MISMATCH:
-    fprintf(stderr, "Image format mismatch.\n");
-    break;
-  case CL_IMAGE_FORMAT_NOT_SUPPORTED:
-    fprintf(stderr, "Image format not supported.\n");
-    break;
-  case CL_BUILD_PROGRAM_FAILURE:
-    fprintf(stderr, "Build program failure.\n");
-    break;
-  case CL_MAP_FAILURE:
-    fprintf(stderr, "Map failure.\n");
-    break;
-  case CL_MISALIGNED_SUB_BUFFER_OFFSET:
-    fprintf(stderr, "Misaligned sub buffer offset.\n");
-    break;
-  case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
-    fprintf(stderr, "Exec status error for events in wait list.\n");
-    break;
-  case CL_COMPILE_PROGRAM_FAILURE:
-    fprintf(stderr, "Compile program failure.\n");
-    break;
-  case CL_LINKER_NOT_AVAILABLE:
-    fprintf(stderr, "Linker not available.\n");
-    break;
-  case CL_LINK_PROGRAM_FAILURE:
-    fprintf(stderr, "Link program failure.\n");
-    break;
-  case CL_DEVICE_PARTITION_FAILED:
-    fprintf(stderr, "Device partition failed.\n");
-    break;
-  case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
-    fprintf(stderr, "Kernel arg info not available.\n");
-    break;
-
-  // Compiler errors.
-  case CL_INVALID_VALUE:
-    fprintf(stderr, "Invalid value.\n");
-    break;
-  case CL_INVALID_DEVICE_TYPE:
-    fprintf(stderr, "Invalid device type.\n");
-    break;
-  case CL_INVALID_PLATFORM:
-    fprintf(stderr, "Invalid platform.\n");
-    break;
-  case CL_INVALID_DEVICE:
-    fprintf(stderr, "Invalid device.\n");
-    break;
-  case CL_INVALID_CONTEXT:
-    fprintf(stderr, "Invalid context.\n");
-    break;
-  case CL_INVALID_QUEUE_PROPERTIES:
-    fprintf(stderr, "Invalid queue properties.\n");
-    break;
-  case CL_INVALID_COMMAND_QUEUE:
-    fprintf(stderr, "Invalid command queue.\n");
-    break;
-  case CL_INVALID_HOST_PTR:
-    fprintf(stderr, "Invalid host pointer.\n");
-    break;
-  case CL_INVALID_MEM_OBJECT:
-    fprintf(stderr, "Invalid memory object.\n");
-    break;
-  case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-    fprintf(stderr, "Invalid image format descriptor.\n");
-    break;
-  case CL_INVALID_IMAGE_SIZE:
-    fprintf(stderr, "Invalid image size.\n");
-    break;
-  case CL_INVALID_SAMPLER:
-    fprintf(stderr, "Invalid sampler.\n");
-    break;
-  case CL_INVALID_BINARY:
-    fprintf(stderr, "Invalid binary.\n");
-    break;
-  case CL_INVALID_BUILD_OPTIONS:
-    fprintf(stderr, "Invalid build options.\n");
-    break;
-  case CL_INVALID_PROGRAM:
-    fprintf(stderr, "Invalid program.\n");
-    break;
-  case CL_INVALID_PROGRAM_EXECUTABLE:
-    fprintf(stderr, "Invalid program executable.\n");
-    break;
-  case CL_INVALID_KERNEL_NAME:
-    fprintf(stderr, "Invalid kernel name.\n");
-    break;
-  case CL_INVALID_KERNEL_DEFINITION:
-    fprintf(stderr, "Invalid kernel definition.\n");
-    break;
-  case CL_INVALID_KERNEL:
-    fprintf(stderr, "Invalid kernel.\n");
-    break;
-  case CL_INVALID_ARG_INDEX:
-    fprintf(stderr, "Invalid arg index.\n");
-    break;
-  case CL_INVALID_ARG_VALUE:
-    fprintf(stderr, "Invalid arg value.\n");
-    break;
-  case CL_INVALID_ARG_SIZE:
-    fprintf(stderr, "Invalid arg size.\n");
-    break;
-  case CL_INVALID_KERNEL_ARGS:
-    fprintf(stderr, "Invalid kernel args.\n");
-    break;
-  case CL_INVALID_WORK_DIMENSION:
-    fprintf(stderr, "Invalid work dimension.\n");
-    break;
-  case CL_INVALID_WORK_GROUP_SIZE:
-    fprintf(stderr, "Invalid work group size.\n");
-    break;
-  case CL_INVALID_WORK_ITEM_SIZE:
-    fprintf(stderr, "Invalid work item size.\n");
-    break;
-  case CL_INVALID_GLOBAL_OFFSET:
-    fprintf(stderr, "Invalid global offset.\n");
-    break;
-  case CL_INVALID_EVENT_WAIT_LIST:
-    fprintf(stderr, "Invalid event wait list.\n");
-    break;
-  case CL_INVALID_EVENT:
-    fprintf(stderr, "Invalid event.\n");
-    break;
-  case CL_INVALID_OPERATION:
-    fprintf(stderr, "Invalid operation.\n");
-    break;
-  case CL_INVALID_GL_OBJECT:
-    fprintf(stderr, "Invalid GL object.\n");
-    break;
-  case CL_INVALID_BUFFER_SIZE:
-    fprintf(stderr, "Invalid buffer size.\n");
-    break;
-  case CL_INVALID_MIP_LEVEL:
-    fprintf(stderr, "Invalid mip level.\n");
-    break;
-  case CL_INVALID_GLOBAL_WORK_SIZE:
-    fprintf(stderr, "Invalid global work size.\n");
-    break;
-  case CL_INVALID_PROPERTY:
-    fprintf(stderr, "Invalid property.\n");
-    break;
-  case CL_INVALID_IMAGE_DESCRIPTOR:
-    fprintf(stderr, "Invalid image descriptor.\n");
-    break;
-  case CL_INVALID_COMPILER_OPTIONS:
-    fprintf(stderr, "Invalid compiler options.\n");
-    break;
-  case CL_INVALID_LINKER_OPTIONS:
-    fprintf(stderr, "Invalid linker options.\n");
-    break;
-  case CL_INVALID_DEVICE_PARTITION_COUNT:
-    fprintf(stderr, "Invalid device partition count.\n");
-    break;
-  case CL_INVALID_PIPE_SIZE:
-    fprintf(stderr, "Invalid pipe size.\n");
-    break;
-  case CL_INVALID_DEVICE_QUEUE:
-    fprintf(stderr, "Invalid device queue.\n");
-    break;
-
-  // NVIDIA specific error.
-  case -9999:
-    fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
-    break;
-
-  default:
-    fprintf(stderr, "Unknown error code!\n");
-    break;
-  }
-}
-
-#endif /* HAS_LIBOPENCL */
-/******************************************************************************/
-/*                                   CUDA                                     */
-/******************************************************************************/
-#ifdef HAS_LIBCUDART
-
-struct CUDAContextT {
   CUcontext Cuda;
 };
 
-struct CUDAKernelT {
+struct PollyGPUFunctionT {
   CUfunction Cuda;
   CUmodule CudaModule;
-  const char *BinaryString;
+  const char *PTXString;
 };
 
-struct CUDADevicePtrT {
+struct PollyGPUDevicePtrT {
   CUdeviceptr Cuda;
 };
@@ -906,10 +57,10 @@
 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
 
 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
-    CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
-    unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
-    unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
-    void **KernelParams, void **Extra);
+    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
+    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
+    void **kernelParams, void **extra);
 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
 
 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
@@ -944,8 +95,8 @@
                                                  void **);
 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
 
-typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
-                                               const void *Image);
+typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
+                                               const void *image);
 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
 
 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
@@ -958,25 +109,25 @@
 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
 
-typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
-                                            CUjitInputType Type, void *Data,
-                                            size_t Size, const char *Name,
-                                            unsigned int NumOptions,
-                                            CUjit_option *Options,
-                                            void **OptionValues);
+typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
+                                            CUjitInputType type, void *data,
+                                            size_t size, const char *name,
+                                            unsigned int numOptions,
+                                            CUjit_option *options,
+                                            void **optionValues);
 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
 
-typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
-                                           CUjit_option *Options,
-                                           void **OptionValues,
-                                           CUlinkState *StateOut);
+typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
+                                           CUjit_option *options,
+                                           void **optionValues,
+                                           CUlinkState *stateOut);
 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
 
-typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
-                                             size_t *SizeOut);
+typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
+                                             size_t *sizeOut);
 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
 
-typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
+typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
 
 typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
@@ -986,36 +137,36 @@
 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
 
-static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
+static void *getAPIHandle(void *Handle, const char *FuncName) {
   char *Err;
   void *FuncPtr;
   dlerror();
   FuncPtr = dlsym(Handle, FuncName);
   if ((Err = dlerror()) != 0) {
-    fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
+    fprintf(stdout, "Load CUDA driver API failed: %s. \n", Err);
     return 0;
   }
 
   return FuncPtr;
 }
 
-static int initialDeviceAPILibrariesCUDA() {
+static int initialDeviceAPILibraries() {
   HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
   if (!HandleCuda) {
-    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
+    printf("Cannot open library: %s. \n", dlerror());
     return 0;
   }
 
   HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
   if (!HandleCudaRT) {
-    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
+    printf("Cannot open library: %s. \n", dlerror());
    return 0;
   }
 
   return 1;
 }
 
-static int initialDeviceAPIsCUDA() {
-  if (initialDeviceAPILibrariesCUDA() == 0)
+static int initialDeviceAPIs() {
+  if (initialDeviceAPILibraries() == 0)
     return 0;
 
   /* Get function pointer to CUDA Driver APIs.
@@ -1027,76 +178,77 @@
    * as it is valid on POSIX 2008.
    */
   CuLaunchKernelFcnPtr =
-      (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
+      (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");
 
   CuMemAllocFcnPtr =
-      (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
+      (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");
 
-  CuMemFreeFcnPtr =
-      (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
+  CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2");
 
   CuMemcpyDtoHFcnPtr =
-      (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
+      (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2");
 
   CuMemcpyHtoDFcnPtr =
-      (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
+      (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2");
 
   CuModuleUnloadFcnPtr =
-      (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
+      (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload");
 
   CuCtxDestroyFcnPtr =
-      (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
+      (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy");
 
-  CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
+  CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit");
 
   CuDeviceGetCountFcnPtr =
-      (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
+      (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount");
 
   CuDeviceGetFcnPtr =
-      (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
+      (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet");
 
   CuCtxCreateFcnPtr =
-      (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
+      (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2");
 
-  CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
-      HandleCuda, "cuModuleLoadDataEx");
+  CuModuleLoadDataExFcnPtr =
+      (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
 
   CuModuleLoadDataFcnPtr =
-      (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
+      (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
 
-  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
+  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
       HandleCuda, "cuModuleGetFunction");
 
   CuDeviceComputeCapabilityFcnPtr =
-      (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
+      (CuDeviceComputeCapabilityFcnTy *)getAPIHandle(
           HandleCuda, "cuDeviceComputeCapability");
 
   CuDeviceGetNameFcnPtr =
-      (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
+      (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
 
   CuLinkAddDataFcnPtr =
-      (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
+      (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
 
   CuLinkCreateFcnPtr =
-      (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
+      (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
 
   CuLinkCompleteFcnPtr =
-      (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
+      (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
 
   CuLinkDestroyFcnPtr =
-      (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
+      (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
 
   CuCtxSynchronizeFcnPtr =
-      (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
+      (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize");
 
   /* Get function pointer to CUDA Runtime APIs. */
-  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
+  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
      HandleCudaRT, "cudaThreadSynchronize");
 
   return 1;
 }
 
-static PollyGPUContext *initContextCUDA() {
+PollyGPUContext *polly_initContext() {
+  DebugMode = getenv("POLLY_DEBUG") != 0;
+
   dump_function();
 
   PollyGPUContext *Context;
   CUdevice Device;
@@ -1111,20 +263,20 @@
     return CurrentContext;
 
   /* Get API handles. */
-  if (initialDeviceAPIsCUDA() == 0) {
-    fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
+  if (initialDeviceAPIs() == 0) {
+    fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n");
     exit(-1);
   }
 
   if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
-    fprintf(stderr, "Initializing the CUDA driver API failed.\n");
+    fprintf(stdout, "Initializing the CUDA driver API failed.\n");
     exit(-1);
   }
 
   /* Get number of devices that supports CUDA. */
   CuDeviceGetCountFcnPtr(&DeviceCount);
   if (DeviceCount == 0) {
-    fprintf(stderr, "There is no device supporting CUDA.\n");
+    fprintf(stdout, "There is no device supporting CUDA.\n");
     exit(-1);
   }
 
@@ -1138,15 +290,12 @@
   /* Create context on the device. */
   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
   if (Context == 0) {
-    fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
+    fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
     exit(-1);
   }
-  Context->Context = malloc(sizeof(CUDAContext));
-  if (Context->Context == 0) {
-    fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
-    exit(-1);
-  }
-  CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device);
+  CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device);
+
+  CacheMode = getenv("POLLY_NOCACHE") == 0;
 
   if (CacheMode)
     CurrentContext = Context;
@@ -1154,24 +303,18 @@
   return Context;
 }
 
-static void freeKernelCUDA(PollyGPUFunction *Kernel) {
-  dump_function();
-
-  if (CacheMode)
-    return;
-
-  if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
-    CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
-
-  if (Kernel->Kernel)
-    free((CUDAKernel *)Kernel->Kernel);
+static void freeKernel(PollyGPUFunction *Kernel) {
+  if (Kernel->CudaModule)
+    CuModuleUnloadFcnPtr(Kernel->CudaModule);
 
   if (Kernel)
     free(Kernel);
 }
 
-static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
-                                       const char *KernelName) {
+#define KERNEL_CACHE_SIZE 10
+
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+                                  const char *KernelName) {
   dump_function();
 
   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
@@ -1181,21 +324,16 @@
     // We exploit here the property that all Polly-ACC kernels are allocated
     // as global constants, hence a pointer comparision is sufficient to
     // determin equality.
-    if (KernelCache[i] &&
-        ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
+    if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
       debug_print("  -> using cached kernel\n");
       return KernelCache[i];
     }
   }
 
   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
+
   if (Function == 0) {
-    fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
-    exit(-1);
-  }
-  Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
-  if (Function->Kernel == 0) {
-    fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
+    fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
     exit(-1);
   }
 
@@ -1232,45 +370,43 @@
   memset(ErrorLog, 0, sizeof(ErrorLog));
 
   CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
-  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
-                            strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
+  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
+                            strlen(PTXBuffer) + 1, 0, 0, 0, 0);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
+    fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
     exit(-1);
   }
 
   Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Complete ptx linker step failed.\n");
-    fprintf(stderr, "\n%s\n", ErrorLog);
+    fprintf(stdout, "Complete ptx linker step failed.\n");
+    fprintf(stdout, "\n%s\n", ErrorLog);
    exit(-1);
   }
 
   debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
               InfoLog);
 
-  Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
-                               CuOut);
+  Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Loading ptx assembly text failed.\n");
+    fprintf(stdout, "Loading ptx assembly text failed.\n");
     exit(-1);
   }
 
-  Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
-                                  ((CUDAKernel *)Function->Kernel)->CudaModule,
+  Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
                                   KernelName);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Loading kernel function failed.\n");
+    fprintf(stdout, "Loading kernel function failed.\n");
     exit(-1);
   }
 
   CuLinkDestroyFcnPtr(LState);
 
-  ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
+  Function->PTXString = PTXBuffer;
 
   if (CacheMode) {
     if (KernelCache[NextCacheItem])
-      freeKernelCUDA(KernelCache[NextCacheItem]);
+      freeKernel(KernelCache[NextCacheItem]);
 
     KernelCache[NextCacheItem] = Function;
 
@@ -1280,37 +416,44 @@
   return Function;
 }
 
-static void synchronizeDeviceCUDA() {
+void polly_freeKernel(PollyGPUFunction *Kernel) {
   dump_function();
-  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
-    fprintf(stderr, "Synchronizing device and host memory failed.\n");
-    exit(-1);
-  }
+
+  if (CacheMode)
+    return;
+
+  freeKernel(Kernel);
 }
 
-static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
-                                     long MemSize) {
+void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
+                                long MemSize) {
   dump_function();
 
-  CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
+  CUdeviceptr CuDevData = DevData->Cuda;
   CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
 }
 
-static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
-                                     long MemSize) {
+void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
                                long MemSize) {
   dump_function();
 
-  if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
-                         MemSize) != CUDA_SUCCESS) {
-    fprintf(stderr, "Copying results from device to host memory failed.\n");
+  if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) {
+    fprintf(stdout, "Copying results from device to host memory failed.\n");
+    exit(-1);
+  }
+}
+void polly_synchronizeDevice() {
+  dump_function();
+  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
+    fprintf(stdout, "Synchronizing device and host memory failed.\n");
     exit(-1);
   }
 }
 
-static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
-                             unsigned int GridDimY, unsigned int BlockDimX,
-                             unsigned int BlockDimY, unsigned int BlockDimZ,
-                             void **Parameters) {
+void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
+                        unsigned int GridDimY, unsigned int BlockDimX,
+                        unsigned int BlockDimY, unsigned int BlockDimZ,
+                        void **Parameters) {
   dump_function();
 
   unsigned GridDimZ = 1;
@@ -1319,290 +462,45 @@
   void **Extra = 0;
   CUresult Res;
 
-  Res =
-      CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
-                           GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
-                           SharedMemBytes, Stream, Parameters, Extra);
+  Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
+                             BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
+                             Stream, Parameters, Extra);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Launching CUDA kernel failed.\n");
+    fprintf(stdout, "Launching CUDA kernel failed.\n");
     exit(-1);
   }
 }
 
-static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
+void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
   dump_function();
 
-  CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
-  CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
-  free(DevPtr);
+  CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda);
   free(Allocation);
 }
 
-static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
+PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
  dump_function();
 
   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
+
   if (DevData == 0) {
-    fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
-    exit(-1);
-  }
-  DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
-  if (DevData->DevicePtr == 0) {
-    fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+    fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
     exit(-1);
   }
 
-  CUresult Res =
-      CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
+  CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize);
 
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+    fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
     exit(-1);
   }
 
   return DevData;
 }
 
-static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
-  dump_function();
-
-  CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
-  return (void *)DevPtr->Cuda;
-}
-
-static void freeContextCUDA(PollyGPUContext *Context) {
-  dump_function();
-
-  CUDAContext *Ctx = (CUDAContext *)Context->Context;
-  if (Ctx->Cuda) {
-    CuCtxDestroyFcnPtr(Ctx->Cuda);
-    free(Ctx);
-    free(Context);
-  }
-
-  dlclose(HandleCuda);
-  dlclose(HandleCudaRT);
-}
-
-#endif /* HAS_LIBCUDART */
-/******************************************************************************/
-/*                                    API                                     */
-/******************************************************************************/
-
-PollyGPUContext *polly_initContext() {
-  DebugMode = getenv("POLLY_DEBUG") != 0;
-  CacheMode = getenv("POLLY_NOCACHE") == 0;
-
-  dump_function();
-
-  PollyGPUContext *Context;
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    Context = initContextCUDA();
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    Context = initContextCL();
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-
-  return Context;
-}
-
-void polly_freeKernel(PollyGPUFunction *Kernel) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    freeKernelCUDA(Kernel);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    freeKernelCL(Kernel);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
-                                  const char *KernelName) {
-  dump_function();
-
-  PollyGPUFunction *Function;
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    Function = getKernelCUDA(BinaryBuffer, KernelName);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    Function = getKernelCL(BinaryBuffer, KernelName);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-
-  return Function;
-}
-
-void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
-                                long MemSize) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    copyFromHostToDeviceCL(HostData, DevData, MemSize);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
-                                long MemSize) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    copyFromDeviceToHostCL(DevData, HostData, MemSize);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
-                        unsigned int GridDimY, unsigned int BlockDimX,
-                        unsigned int BlockDimY, unsigned int BlockDimZ,
-                        void **Parameters) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
-                     BlockDimZ, Parameters);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
-                   Parameters);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    freeDeviceMemoryCUDA(Allocation);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    freeDeviceMemoryCL(Allocation);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
-  dump_function();
-
-  PollyGPUDevicePtr *DevData;
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    DevData = allocateMemoryForDeviceCUDA(MemSize);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    DevData = allocateMemoryForDeviceCL(MemSize);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-
-  return DevData;
-}
-
 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
   dump_function();
 
-  void *DevPtr;
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    DevPtr = getDevicePtrCUDA(Allocation);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    DevPtr = getDevicePtrCL(Allocation);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-
-  return DevPtr;
-}
-
-void polly_synchronizeDevice() {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    synchronizeDeviceCUDA();
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    synchronizeDeviceCL();
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
+  return (void *)Allocation->Cuda;
 }
 
 void polly_freeContext(PollyGPUContext *Context) {
@@ -1611,40 +509,11 @@
   if (CacheMode)
     return;
 
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    freeContextCUDA(Context);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    freeContextCL(Context);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
+  if (Context->Cuda) {
+    CuCtxDestroyFcnPtr(Context->Cuda);
+    free(Context);
   }
-}
-
-/* Initialize GPUJIT with CUDA as runtime library. */
-PollyGPUContext *polly_initContextCUDA() {
-#ifdef HAS_LIBCUDART
-  Runtime = RUNTIME_CUDA;
-  return polly_initContext();
-#else
-  fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
-  exit(-1);
-#endif /* HAS_LIBCUDART */
-}
-/* Initialize GPUJIT with OpenCL as runtime library. */
-PollyGPUContext *polly_initContextCL() {
-#ifdef HAS_LIBOPENCL
-  Runtime = RUNTIME_CL;
-  return polly_initContext();
-#else
-  fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
-  exit(-1);
-#endif /* HAS_LIBOPENCL */
+  dlclose(HandleCuda);
+  dlclose(HandleCudaRT);
 }
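
Note (illustration, not part of the patch): the runtime keeps binding the CUDA driver API at run time through dlopen/dlsym, as getAPIHandle() and initialDeviceAPIs() above show. The following standalone sketch isolates that pattern. It is a minimal sketch only: it resolves just cuInit from libcuda.so, treats CUresult as a plain int for brevity, and performs none of the bookkeeping GPUJIT does.

#include <dlfcn.h>
#include <stdio.h>

/* Function type matching CUresult cuInit(unsigned int) by ABI; CUresult is
 * modelled as int here to avoid pulling in cuda.h for the sketch. */
typedef int CuInitFcnTy(unsigned int);

int main(void) {
  void *Handle = dlopen("libcuda.so", RTLD_LAZY);
  if (!Handle) {
    fprintf(stderr, "Cannot open library: %s.\n", dlerror());
    return 1;
  }

  dlerror(); /* Clear any stale error state before dlsym. */
  CuInitFcnTy *CuInitFcnPtr = (CuInitFcnTy *)dlsym(Handle, "cuInit");
  char *Err = dlerror();
  if (Err) {
    fprintf(stderr, "Load CUDA driver API failed: %s.\n", Err);
    return 1;
  }

  /* 0 == CUDA_SUCCESS. The object-to-function pointer cast is valid on
   * POSIX 2008, as the comment in initialDeviceAPIs() notes. */
  if (CuInitFcnPtr(0) != 0)
    fprintf(stderr, "Initializing the CUDA driver API failed.\n");

  dlclose(Handle);
  return 0;
}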
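Note (illustration, not part of the patch): the cache kept by polly_getKernel() can be read as the following self-contained model. Entry, compileToEntry, and lookupOrCompile are placeholder names invented for this sketch; only the policy mirrors the code above, namely a thread-local ring of KERNEL_CACHE_SIZE slots matched by pointer identity of the PTX string, with FIFO eviction.

#include <stdlib.h>

#define KERNEL_CACHE_SIZE 10

typedef struct Entry {
  const char *PTXString; /* Identity key: the address of the PTX global. */
  /* Compiled module/function state would live here in the real runtime. */
} Entry;

/* Stand-in for the PTX JIT-link step performed by polly_getKernel(). */
static Entry *compileToEntry(const char *PTXBuffer) {
  Entry *E = malloc(sizeof(Entry));
  if (E)
    E->PTXString = PTXBuffer;
  return E;
}

Entry *lookupOrCompile(const char *PTXBuffer) {
  static __thread Entry *Cache[KERNEL_CACHE_SIZE];
  static __thread int Next = 0;

  /* Polly emits each kernel's PTX as a global constant, so comparing the
   * pointer itself is enough to recognize a previously seen kernel. */
  for (int i = 0; i < KERNEL_CACHE_SIZE; i++)
    if (Cache[i] && Cache[i]->PTXString == PTXBuffer)
      return Cache[i];

  Entry *E = compileToEntry(PTXBuffer);

  /* Evict in FIFO order once the ring is full; the real code frees the
   * evicted kernel via freeKernel() before overwriting the slot. */
  free(Cache[Next]);
  Cache[Next] = E;
  Next = (Next + 1) % KERNEL_CACHE_SIZE;
  return E;
}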
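Note (illustration, not part of the patch): after this change the host-side surface is the CUDA-only set of polly_* entry points shown above. The sketch below strings them together in the order generated code would. Real callers are emitted by Polly's GPGPU code generation rather than written by hand; the PTX blob, kernel name, launch dimensions, and the GPUJIT.h include are assumptions made purely for illustration.

#include "GPUJIT.h"

extern const char KernelPTX[]; /* Hypothetical PTX blob, a global constant. */

void run(float *HostA, long Bytes) {
  PollyGPUContext *Ctx = polly_initContext();
  PollyGPUFunction *F = polly_getKernel(KernelPTX, "kernel_0");

  PollyGPUDevicePtr *DevA = polly_allocateMemoryForDevice(Bytes);
  polly_copyFromHostToDevice(HostA, DevA, Bytes);

  /* Kernel parameters are passed as an array of pointers to their values. */
  void *DevPtr = polly_getDevicePtr(DevA);
  void *Params[] = {&DevPtr};
  polly_launchKernel(F, /*GridDimX=*/32, /*GridDimY=*/1,
                     /*BlockDimX=*/256, /*BlockDimY=*/1, /*BlockDimZ=*/1,
                     Params);
  polly_synchronizeDevice();

  polly_copyFromDeviceToHost(DevA, HostA, Bytes);
  polly_freeDeviceMemory(DevA);
  polly_freeKernel(F);
  polly_freeContext(Ctx);
}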