diff --git a/polly/CMakeLists.txt b/polly/CMakeLists.txt --- a/polly/CMakeLists.txt +++ b/polly/CMakeLists.txt @@ -85,6 +85,31 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF) +set(GPU_CODEGEN FALSE) +if (POLLY_ENABLE_GPGPU_CODEGEN) + # Do not require CUDA/OpenCL, as GPU code generation test cases can be run + # without a CUDA/OpenCL library. + if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) + FIND_PACKAGE(CUDA) + FIND_PACKAGE(OpenCL) + set(GPU_CODEGEN TRUE) + else() + message(WARNING "The LLVM NVPTX target is required for GPU code generation") + endif() +endif(POLLY_ENABLE_GPGPU_CODEGEN) + + +# Support GPGPU code generation if the library is available. +if (CUDA_FOUND) + add_definitions(-DHAS_LIBCUDART) + INCLUDE_DIRECTORIES( ${CUDA_INCLUDE_DIRS} ) +endif(CUDA_FOUND) +if (OpenCL_FOUND) + add_definitions(-DHAS_LIBOPENCL) + INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} ) +endif(OpenCL_FOUND) + option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON) if (NOT POLLY_BUNDLED_ISL) find_package(ISL MODULE REQUIRED) @@ -130,6 +155,7 @@ if (POLLY_GTEST_AVAIL) add_subdirectory(unittests) endif () +add_subdirectory(tools) add_subdirectory(cmake) # TODO: docs. diff --git a/polly/CREDITS.txt b/polly/CREDITS.txt --- a/polly/CREDITS.txt +++ b/polly/CREDITS.txt @@ -18,6 +18,11 @@ W: http://www.grosser.es D: Co-founder, design of the overall architecture +N: Yabin Hu +E: yabin.hwu@gmail.com +D: GPGPU code generation +D: Google Summer of Code student 2012, 2014 + N: Andreas Simbuerger E: simbuerg@fim.uni-passau.de W: http://www.infosun.fim.uni-passau.de/cl/staff/simbuerger/ diff --git a/polly/cmake/CMakeLists.txt b/polly/cmake/CMakeLists.txt --- a/polly/cmake/CMakeLists.txt +++ b/polly/cmake/CMakeLists.txt @@ -27,6 +27,9 @@ # LLVMPolly is a dummy target on Win or if PIC code is disabled. list(APPEND POLLY_CONFIG_EXPORTED_TARGETS LLVMPolly) endif() +if (POLLY_ENABLE_GPGPU_CODEGEN) + list(APPEND POLLY_CONFIG_EXPORTED_TARGETS PollyPPCG) +endif() # Get the target type for every exported target foreach(tgt IN LISTS POLLY_CONFIG_EXPORTED_TARGETS) diff --git a/polly/cmake/PollyConfig.cmake.in b/polly/cmake/PollyConfig.cmake.in --- a/polly/cmake/PollyConfig.cmake.in +++ b/polly/cmake/PollyConfig.cmake.in @@ -8,6 +8,7 @@ set(Polly_CMAKE_DIR ${CMAKE_CURRENT_LIST_DIR}) set(Polly_BUNDLED_ISL @POLLY_BUNDLED_ISL@) +set(Polly_ENABLE_GPGPU_CODEGEN @POLLY_ENABLE_GPGPU_CODEGEN@) set(Polly_DEFINITIONS ${LLVM_DEFINITIONS}) set(Polly_INCLUDE_DIRS @POLLY_CONFIG_INCLUDE_DIRS@ ${LLVM_INCLUDE_DIRS}) @@ -18,9 +19,17 @@ # Imported Targets: @ISL_CONFIG_CODE@ +if (Polly_ENABLE_GPGPU_CODEGEN AND NOT TARGET PollyPPCG) + add_library(PollyPPCG @POLLY_CONFIG_TARGET_PollyPPCG_TYPE@ IMPORTED) + set_property(TARGET PollyPPCG PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@) +endif() + if (NOT TARGET Polly) add_library(Polly @POLLY_CONFIG_TARGET_Polly_TYPE@ IMPORTED) set_property(TARGET Polly PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@) + if (Polly_ENABLE_GPGPU_CODEGEN) + set_property(TARGET Polly APPEND PROPERTY INTERFACE_LINK_LIBRARIES PollyPPCG) + endif() endif() if (NOT TARGET LLVMPolly) diff --git a/polly/include/polly/CodeGen/PPCGCodeGeneration.h b/polly/include/polly/CodeGen/PPCGCodeGeneration.h new file mode 100644 --- /dev/null +++ b/polly/include/polly/CodeGen/PPCGCodeGeneration.h @@ -0,0 +1,33 @@ +//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. 
--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Take a scop created by ScopInfo and map it to GPU code using the ppcg +// GPU mapping strategy. +// +//===----------------------------------------------------------------------===// + +#ifndef POLLY_PPCGCODEGENERATION_H +#define POLLY_PPCGCODEGENERATION_H + +/// The GPU Architecture to target. +enum GPUArch { NVPTX64, SPIR32, SPIR64 }; + +/// The GPU Runtime implementation to use. +enum GPURuntime { CUDA, OpenCL }; + +namespace polly { +extern bool PollyManagedMemory; + +/// Use for pass instantiation defaults. +/// @{ +extern GPURuntime GPURuntimeChoice; +extern GPUArch GPUArchChoice; +/// @} +} // namespace polly + +#endif // POLLY_PPCGCODEGENERATION_H diff --git a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h --- a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h +++ b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h @@ -30,20 +30,24 @@ struct RuntimeDebugBuilder { /// Generate a constant string into the builder's llvm::Module which can be - /// passed to createCPUPrinter(). + /// passed to createGPUPrinter() or createGPUPrinter(). /// /// @param Builder The builder used to emit the printer calls. /// @param Str The string to be printed. /// @return A global containing @p Str. static llvm::Value *getPrintableString(PollyIRBuilder &Builder, - llvm::StringRef Str); + llvm::StringRef Str) { + // TODO: Get rid of magic number 4. It it NVPTX's constant address space and + // works on X86 (CPU) only because its backend ignores the address space. + return Builder.CreateGlobalStringPtr(Str, "", 4); + } /// Return whether an llvm::Value of the type @p Ty is printable for /// debugging. /// - /// That is, whether such a value can be passed to createGPUPrinter() - /// to be dumped as runtime. If false is returned, those + /// That is, whether such a value can be passed to createGPUPrinter() or + /// createGPUPrinter() to be dumped as runtime. If false is returned, those /// functions will fail. static bool isPrintable(llvm::Type *Ty); @@ -60,41 +64,62 @@ template static void createCPUPrinter(PollyIRBuilder &Builder, Args... args) { std::vector Vector; - createPrinter(Builder, Vector, args...); + createPrinter(Builder, /* CPU */ false, Vector, args...); + } + + /// Print a set of LLVM-IR Values or StringRefs on an NVIDIA GPU. + /// + /// This function emits a call to vprintf that will print the given + /// arguments from within a kernel thread. It is useful for debugging + /// CUDA program kernels. All arguments given in this list will be + /// automatically concatenated and the resulting string will be printed + /// atomically. We also support ArrayRef arguments, which can be used to + /// provide for example a list of thread-id values. + /// + /// @param Builder The builder used to emit the printer calls. + /// @param Args The list of values to print. + template + static void createGPUPrinter(PollyIRBuilder &Builder, Args... args) { + std::vector Vector; + createPrinter(Builder, /* GPU */ true, Vector, args...); } private: /// Handle Values. template - static void createPrinter(PollyIRBuilder &Builder, + static void createPrinter(PollyIRBuilder &Builder, bool UseGPU, std::vector &Values, llvm::Value *Value, Args... 
args) { Values.push_back(Value); - createPrinter(Builder, Values, args...); + createPrinter(Builder, UseGPU, Values, args...); } /// Handle StringRefs. template - static void createPrinter(PollyIRBuilder &Builder, + static void createPrinter(PollyIRBuilder &Builder, bool UseGPU, std::vector &Values, llvm::StringRef String, Args... args) { Values.push_back(getPrintableString(Builder, String)); - createPrinter(Builder, Values, args...); + createPrinter(Builder, UseGPU, Values, args...); } /// Handle ArrayRefs. template - static void createPrinter(PollyIRBuilder &Builder, + static void createPrinter(PollyIRBuilder &Builder, bool UseGPU, std::vector &Values, llvm::ArrayRef Array, Args... args) { Values.insert(Values.end(), Array.begin(), Array.end()); - createPrinter(Builder, Values, args...); + createPrinter(Builder, UseGPU, Values, args...); } /// Print a list of Values. - static void createPrinter(PollyIRBuilder &Builder, + static void createPrinter(PollyIRBuilder &Builder, bool UseGPU, llvm::ArrayRef Values); + /// Print a list of Values on a GPU. + static void createGPUPrinterT(PollyIRBuilder &Builder, + llvm::ArrayRef Values); + /// Print a list of Values on a CPU. static void createCPUPrinterT(PollyIRBuilder &Builder, llvm::ArrayRef Values); @@ -120,6 +145,22 @@ /// /// @parma Builder The builder used to insert the code. static void createFlush(PollyIRBuilder &Builder); + + /// Get (and possibly insert) a NVIDIA address space cast call. + static llvm::Function *getAddressSpaceCast(PollyIRBuilder &Builder, + unsigned Src, unsigned Dst, + unsigned SrcBits = 8, + unsigned DstBits = 8); + + /// Get identifiers that describe the currently executed GPU thread. + /// + /// The result will be a vector that if passed to the GPU printer will result + /// into a string (initialized to values corresponding to the printing + /// thread): + /// + /// "> block-id: bidx bid1y bidz | thread-id: tidx tidy tidz " + static std::vector + getGPUThreadIdentifiers(PollyIRBuilder &Builder); }; } // namespace polly diff --git a/polly/include/polly/Config/config.h.cmake b/polly/include/polly/Config/config.h.cmake --- a/polly/include/polly/Config/config.h.cmake +++ b/polly/include/polly/Config/config.h.cmake @@ -12,4 +12,7 @@ #ifndef POLLY_CONFIG_H #define POLLY_CONFIG_H +#cmakedefine CUDA_FOUND +#cmakedefine GPU_CODEGEN + #endif diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h --- a/polly/include/polly/LinkAllPasses.h +++ b/polly/include/polly/LinkAllPasses.h @@ -14,6 +14,7 @@ #ifndef POLLY_LINKALLPASSES_H #define POLLY_LINKALLPASSES_H +#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/Config/config.h" #include "polly/Support/DumpFunctionPass.h" #include "polly/Support/DumpModulePass.h" @@ -53,6 +54,14 @@ llvm::Pass *createIslAstInfoWrapperPassPass(); llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); llvm::Pass *createCodeGenerationPass(); +#ifdef GPU_CODEGEN +llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64, + GPURuntime Runtime = GPURuntime::CUDA); + +llvm::Pass * +createManagedMemoryRewritePassPass(GPUArch Arch = GPUArch::NVPTX64, + GPURuntime Runtime = GPURuntime::CUDA); +#endif llvm::Pass *createIslScheduleOptimizerWrapperPass(); llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); llvm::Pass *createFlattenSchedulePass(); @@ -104,6 +113,10 @@ polly::createIslAstInfoWrapperPassPass(); polly::createIslAstInfoPrinterLegacyPass(llvm::outs()); polly::createCodeGenerationPass(); +#ifdef 
GPU_CODEGEN + polly::createPPCGCodeGenerationPass(); + polly::createManagedMemoryRewritePassPass(); +#endif polly::createIslScheduleOptimizerWrapperPass(); polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs()); polly::createMaximalStaticExpansionPass(); @@ -143,6 +156,10 @@ void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); void initializeCodeGenerationPass(llvm::PassRegistry &); +#ifdef GPU_CODEGEN +void initializePPCGCodeGenerationPass(llvm::PassRegistry &); +void initializeManagedMemoryRewritePassPass(llvm::PassRegistry &); +#endif void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &); diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -1684,6 +1684,9 @@ /// Number of copy statements. unsigned CopyStmtsNum = 0; + /// Flag to indicate if the Scop is to be skipped. + bool SkipScop = false; + using StmtSet = std::list; /// The statements in this Scop. @@ -2141,6 +2144,12 @@ /// Check if the SCoP has been optimized by the scheduler. bool isOptimized() const { return IsOptimized; } + /// Mark the SCoP to be skipped by ScopPass passes. + void markAsToBeSkipped() { SkipScop = true; } + + /// Check if the SCoP is to be skipped by ScopPass passes. + bool isToBeSkipped() const { return SkipScop; } + /// Return the ID of the Scop int getID() const { return ID; } diff --git a/polly/include/polly/Support/LinkGPURuntime.h b/polly/include/polly/Support/LinkGPURuntime.h new file mode 100644 --- /dev/null +++ b/polly/include/polly/Support/LinkGPURuntime.h @@ -0,0 +1,42 @@ +//===- Support/LinkGPURuntime.h -- Headerfile to help force-link GPURuntime =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header helps pull in libGPURuntime.so +// +//===----------------------------------------------------------------------===// +#ifndef POLLY_LINK_GPURUNTIME +#define POLLY_LINK_GPURUNTIME + +extern "C" { +#include "GPURuntime/GPUJIT.h" +} + +namespace polly { +struct ForceGPURuntimeLinking { + ForceGPURuntimeLinking() { + if (std::getenv("bar") != (char *)-1) + return; + // We must reference GPURuntime in such a way that compilers will not + // delete it all as dead code, even with whole program optimization, + // yet is effectively a NO-OP. As the compiler isn't smart enough + // to know that getenv() never returns -1, this will do the job. 
+ polly_initContextCL(); + polly_initContextCUDA(); + polly_getKernel(nullptr, nullptr); + polly_freeKernel(nullptr); + polly_copyFromHostToDevice(nullptr, nullptr, 0); + polly_copyFromDeviceToHost(nullptr, nullptr, 0); + polly_synchronizeDevice(); + polly_launchKernel(nullptr, 0, 0, 0, 0, 0, nullptr); + polly_freeDeviceMemory(nullptr); + polly_freeContext(nullptr); + polly_synchronizeDevice(); + } +} structure; +} // namespace polly +#endif diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt --- a/polly/lib/CMakeLists.txt +++ b/polly/lib/CMakeLists.txt @@ -6,6 +6,13 @@ CodeGen/IslNodeBuilder.cpp CodeGen/CodeGeneration.cpp) +if (GPU_CODEGEN) + set (GPGPU_CODEGEN_FILES + CodeGen/PPCGCodeGeneration.cpp + CodeGen/ManagedMemoryRewrite.cpp + ) +endif (GPU_CODEGEN) + # Compile ISL into a separate library. add_subdirectory(External) @@ -37,6 +44,12 @@ Vectorize ) +# Polly-ACC requires the NVPTX backend to work. Ask LLVM about its libraries. +if (GPU_CODEGEN) + # This call emits an error if they NVPTX backend is not enable. + list(APPEND POLLY_COMPONENTS NVPTX) +endif () + # Use an object-library to add the same files to multiple libs without requiring # the sources them to be recompiled for each of them. add_llvm_pass_plugin(Polly @@ -61,6 +74,7 @@ CodeGen/RuntimeDebugBuilder.cpp CodeGen/CodegenCleanup.cpp CodeGen/PerfMonitor.cpp + ${GPGPU_CODEGEN_FILES} Exchange/JSONExporter.cpp Support/GICHelper.cpp Support/SCEVAffinator.cpp @@ -114,6 +128,16 @@ ${ISL_TARGET} ) +# Additional dependencies for Polly-ACC. +if (GPU_CODEGEN) + target_link_libraries(Polly PUBLIC PollyPPCG) +endif () + +if (NOT LLVM_LINK_LLVM_DYLIB AND NOT LLVM_POLLY_LINK_INTO_TOOLS) + # Polly-ACC requires the NVPTX target to be present in the executable it is linked to + set_property(TARGET bugpoint APPEND PROPERTY LINK_LIBRARIES LLVMTarget) +endif () + # Create a loadable module Polly.so that can be loaded using # LLVM's/clang's "-load" option. if (WIN32 OR NOT LLVM_ENABLE_PIC) @@ -127,6 +151,19 @@ $ ) + # Only add the dependencies that are not part of LLVM. The latter are assumed + # to be already available in the address space the module is loaded into. + # Adding them once more would have the effect that both copies try to register + # the same command line options, to which LLVM reacts with an error. + # If Polly-ACC is enabled, the NVPTX target is also expected to reside in the + # hosts. This is not the case for bugpoint. Use LLVM_POLLY_LINK_INTO_TOOLS=ON + # instead which will automatically resolve the additional dependencies by + # Polly. + target_link_libraries(LLVMPolly PUBLIC ${ISL_TARGET}) + if (GPU_CODEGEN) + target_link_libraries(LLVMPolly PUBLIC PollyPPCG) + endif () + set_target_properties(LLVMPolly PROPERTIES LINKER_LANGUAGE CXX diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp --- a/polly/lib/CodeGen/BlockGenerators.cpp +++ b/polly/lib/CodeGen/BlockGenerators.cpp @@ -238,8 +238,14 @@ Builder.Insert(NewInst); BBMap[Inst] = NewInst; - assert(NewInst->getModule() == Inst->getModule() && - "Expecting instructions to be in the same module"); + // When copying the instruction onto the Module meant for the GPU, + // debug metadata attached to an instruction causes all related + // metadata to be pulled into the Module. This includes the DICompileUnit, + // which will not be listed in llvm.dbg.cu of the Module since the Module + // doesn't contain one. This fails the verification of the Module and the + // subsequent generation of the ASM string. 
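+  // (The symptom, if the location were kept, is typically a verifier error
+  // along the lines of "DICompileUnit not listed in llvm.dbg.cu" reported on
+  // the GPU module.)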
+ if (NewInst->getModule() != Inst->getModule()) + NewInst->setDebugLoc(llvm::DebugLoc()); if (!NewInst->getType()->isVoidTy()) NewInst->setName("p_" + Inst->getName()); diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp --- a/polly/lib/CodeGen/CodeGeneration.cpp +++ b/polly/lib/CodeGen/CodeGeneration.cpp @@ -323,6 +323,10 @@ /// Generate LLVM-IR for the SCoP @p S. bool runOnScop(Scop &S) override { + // Skip SCoPs in case they're already code-generated by PPCGCodeGeneration. + if (S.isToBeSkipped()) + return false; + AI = &getAnalysis().getAI(); LI = &getAnalysis().getLoopInfo(); DT = &getAnalysis().getDomTree(); diff --git a/polly/lib/CodeGen/IslAst.cpp b/polly/lib/CodeGen/IslAst.cpp --- a/polly/lib/CodeGen/IslAst.cpp +++ b/polly/lib/CodeGen/IslAst.cpp @@ -638,6 +638,10 @@ static std::unique_ptr runIslAst( Scop &Scop, function_ref GetDeps) { + // Skip SCoPs in case they're already handled by PPCGCodeGeneration. + if (Scop.isToBeSkipped()) + return {}; + ScopsProcessed++; const Dependences &D = GetDeps(Dependences::AL_Statement); diff --git a/polly/lib/CodeGen/ManagedMemoryRewrite.cpp b/polly/lib/CodeGen/ManagedMemoryRewrite.cpp new file mode 100644 --- /dev/null +++ b/polly/lib/CodeGen/ManagedMemoryRewrite.cpp @@ -0,0 +1,427 @@ +//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Take a module and rewrite: +// 1. `malloc` -> `polly_mallocManaged` +// 2. `free` -> `polly_freeManaged` +// 3. global arrays with initializers -> global arrays that are initialized +// with a constructor call to +// `polly_mallocManaged`. +// +//===----------------------------------------------------------------------===// + +#include "polly/CodeGen/IRBuilder.h" +#include "polly/CodeGen/PPCGCodeGeneration.h" +#include "polly/DependenceInfo.h" +#include "polly/LinkAllPasses.h" +#include "polly/Options.h" +#include "polly/ScopDetection.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; +using namespace polly; + +static cl::opt RewriteAllocas( + "polly-acc-rewrite-allocas", + cl::desc( + "Ask the managed memory rewriter to also rewrite alloca instructions"), + cl::Hidden, cl::cat(PollyCategory)); + +static cl::opt IgnoreLinkageForGlobals( + "polly-acc-rewrite-ignore-linkage-for-globals", + cl::desc( + "By default, we only rewrite globals with internal linkage. This flag " + "enables rewriting of globals regardless of linkage"), + cl::Hidden, cl::cat(PollyCategory)); + +#define DEBUG_TYPE "polly-acc-rewrite-managed-memory" +namespace { + +static llvm::Function *getOrCreatePollyMallocManaged(Module &M) { + const char *Name = "polly_mallocManaged"; + Function *F = M.getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + PollyIRBuilder Builder(M.getContext()); + // TODO: How do I get `size_t`? I assume from DataLayout? 
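+    // One way to answer the TODO above (sketch, not applied here): derive a
+    // size_t-like type from the module's DataLayout instead of assuming i64,
+    //   IntegerType *SizeTy =
+    //       M.getDataLayout().getIntPtrType(M.getContext(), /*AddrSpace=*/0);
+    // and use SizeTy as the parameter type of polly_mallocManaged.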
+ FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), + {Builder.getInt64Ty()}, false); + F = Function::Create(Ty, Linkage, Name, &M); + } + + return F; +} + +static llvm::Function *getOrCreatePollyFreeManaged(Module &M) { + const char *Name = "polly_freeManaged"; + Function *F = M.getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + PollyIRBuilder Builder(M.getContext()); + // TODO: How do I get `size_t`? I assume from DataLayout? + FunctionType *Ty = + FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false); + F = Function::Create(Ty, Linkage, Name, &M); + } + + return F; +} + +// Expand a constant expression `Cur`, which is used at instruction `Parent` +// at index `index`. +// Since a constant expression can expand to multiple instructions, store all +// the expands into a set called `Expands`. +// Note that this goes inorder on the constant expression tree. +// A * ((B * D) + C) +// will be processed with first A, then B * D, then B, then D, and then C. +// Though ConstantExprs are not treated as "trees" but as DAGs, since you can +// have something like this: +// * +// / \ +// \ / +// (D) +// +// For the purposes of this expansion, we expand the two occurences of D +// separately. Therefore, we expand the DAG into the tree: +// * +// / \ +// D D +// TODO: We don't _have_to do this, but this is the simplest solution. +// We can write a solution that keeps track of which constants have been +// already expanded. +static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder, + Instruction *Parent, int index, + SmallPtrSet &Expands) { + assert(Cur && "invalid constant expression passed"); + Instruction *I = Cur->getAsInstruction(); + assert(I && "unable to convert ConstantExpr to Instruction"); + + LLVM_DEBUG(dbgs() << "Expanding ConstantExpression: (" << *Cur + << ") in Instruction: (" << *I << ")\n";); + + // Invalidate `Cur` so that no one after this point uses `Cur`. Rather, + // they should mutate `I`. + Cur = nullptr; + + Expands.insert(I); + Parent->setOperand(index, I); + + // The things that `Parent` uses (its operands) should be created + // before `Parent`. + Builder.SetInsertPoint(Parent); + Builder.Insert(I); + + for (unsigned i = 0; i < I->getNumOperands(); i++) { + Value *Op = I->getOperand(i); + assert(isa(Op) && "constant must have a constant operand"); + + if (ConstantExpr *CExprOp = dyn_cast(Op)) + expandConstantExpr(CExprOp, Builder, I, i, Expands); + } +} + +// Edit all uses of `OldVal` to NewVal` in `Inst`. This will rewrite +// `ConstantExpr`s that are used in the `Inst`. +// Note that `replaceAllUsesWith` is insufficient for this purpose because it +// does not rewrite values in `ConstantExpr`s. +static void rewriteOldValToNew(Instruction *Inst, Value *OldVal, Value *NewVal, + PollyIRBuilder &Builder) { + + // This contains a set of instructions in which OldVal must be replaced. + // We start with `Inst`, and we fill it up with the expanded `ConstantExpr`s + // from `Inst`s arguments. + // We need to go through this process because `replaceAllUsesWith` does not + // actually edit `ConstantExpr`s. + SmallPtrSet InstsToVisit = {Inst}; + + // Expand all `ConstantExpr`s and place it in `InstsToVisit`. 
+ for (unsigned i = 0; i < Inst->getNumOperands(); i++) { + Value *Operand = Inst->getOperand(i); + if (ConstantExpr *ValueConstExpr = dyn_cast(Operand)) + expandConstantExpr(ValueConstExpr, Builder, Inst, i, InstsToVisit); + } + + // Now visit each instruction and use `replaceUsesOfWith`. We know that + // will work because `I` cannot have any `ConstantExpr` within it. + for (Instruction *I : InstsToVisit) + I->replaceUsesOfWith(OldVal, NewVal); +} + +// Given a value `Current`, return all Instructions that may contain `Current` +// in an expression. +// We need this auxiliary function, because if we have a +// `Constant` that is a user of `V`, we need to recurse into the +// `Constant`s uses to gather the root instruction. +static void getInstructionUsersOfValue(Value *V, + SmallVector &Owners) { + if (auto *I = dyn_cast(V)) { + Owners.push_back(I); + } else { + // Anything that is a `User` must be a constant or an instruction. + auto *C = cast(V); + for (Use &CUse : C->uses()) + getInstructionUsersOfValue(CUse.getUser(), Owners); + } +} + +static void +replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array, + SmallPtrSet &ReplacedGlobals) { + // We only want arrays. + ArrayType *ArrayTy = dyn_cast(Array.getValueType()); + if (!ArrayTy) + return; + Type *ElemTy = ArrayTy->getElementType(); + PointerType *ElemPtrTy = ElemTy->getPointerTo(); + + // We only wish to replace arrays that are visible in the module they + // inhabit. Otherwise, our type edit from [T] to T* would be illegal across + // modules. + const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() || + Array.hasInternalLinkage() || + IgnoreLinkageForGlobals; + if (!OnlyVisibleInsideModule) { + LLVM_DEBUG( + dbgs() << "Not rewriting (" << Array + << ") to managed memory " + "because it could be visible externally. To force rewrite, " + "use -polly-acc-rewrite-ignore-linkage-for-globals.\n"); + return; + } + + if (!Array.hasInitializer() || + !isa(Array.getInitializer())) { + LLVM_DEBUG(dbgs() << "Not rewriting (" << Array + << ") to managed memory " + "because it has an initializer which is " + "not a zeroinitializer.\n"); + return; + } + + // At this point, we have committed to replacing this array. 
+ ReplacedGlobals.insert(&Array); + + std::string NewName = Array.getName().str(); + NewName += ".toptr"; + GlobalVariable *ReplacementToArr = + cast(M.getOrInsertGlobal(NewName, ElemPtrTy)); + ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy)); + + Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M); + std::string FnName = Array.getName().str(); + FnName += ".constructor"; + PollyIRBuilder Builder(M.getContext()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); + const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + Function *F = Function::Create(Ty, Linkage, FnName, &M); + BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F); + Builder.SetInsertPoint(Start); + + const uint64_t ArraySizeInt = DL.getTypeAllocSize(ArrayTy); + Value *ArraySize = Builder.getInt64(ArraySizeInt); + ArraySize->setName("array.size"); + + Value *AllocatedMemRaw = + Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw"); + Value *AllocatedMemTyped = + Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed"); + Builder.CreateStore(AllocatedMemTyped, ReplacementToArr); + Builder.CreateRetVoid(); + + const int Priority = 0; + appendToGlobalCtors(M, F, Priority, ReplacementToArr); + + SmallVector ArrayUserInstructions; + // Get all instructions that use array. We need to do this weird thing + // because `Constant`s that contain this array neeed to be expanded into + // instructions so that we can replace their parameters. `Constant`s cannot + // be edited easily, so we choose to convert all `Constant`s to + // `Instruction`s and handle all of the uses of `Array` uniformly. + for (Use &ArrayUse : Array.uses()) + getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions); + + for (Instruction *UserOfArrayInst : ArrayUserInstructions) { + + Builder.SetInsertPoint(UserOfArrayInst); + // ** -> * + Value *ArrPtrLoaded = + Builder.CreateLoad(ElemPtrTy, ReplacementToArr, "arrptr.load"); + // * -> [ty]* + Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast( + ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast"); + rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder); + } +} + +// We return all `allocas` that may need to be converted to a call to +// cudaMallocManaged. 
+static void getAllocasToBeManaged(Function &F, + SmallSet &Allocas) { + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + auto *Alloca = dyn_cast(&I); + if (!Alloca) + continue; + LLVM_DEBUG(dbgs() << "Checking if (" << *Alloca << ") may be captured: "); + + if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false, + /* StoreCaptures */ true)) { + Allocas.insert(Alloca); + LLVM_DEBUG(dbgs() << "YES (captured).\n"); + } else { + LLVM_DEBUG(dbgs() << "NO (not captured).\n"); + } + } + } +} + +static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca, + const DataLayout &DL) { + LLVM_DEBUG(dbgs() << "rewriting: (" << *Alloca << ") to managed mem.\n"); + Module *M = Alloca->getModule(); + assert(M && "Alloca does not have a module"); + + PollyIRBuilder Builder(M->getContext()); + Builder.SetInsertPoint(Alloca); + + Function *MallocManagedFn = + getOrCreatePollyMallocManaged(*Alloca->getModule()); + const uint64_t Size = DL.getTypeAllocSize(Alloca->getAllocatedType()); + Value *SizeVal = Builder.getInt64(Size); + Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal}); + Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType()); + + Function *F = Alloca->getFunction(); + assert(F && "Alloca has invalid function"); + + Bitcasted->takeName(Alloca); + Alloca->replaceAllUsesWith(Bitcasted); + Alloca->eraseFromParent(); + + for (BasicBlock &BB : *F) { + ReturnInst *Return = dyn_cast(BB.getTerminator()); + if (!Return) + continue; + Builder.SetInsertPoint(Return); + + Function *FreeManagedFn = getOrCreatePollyFreeManaged(*M); + Builder.CreateCall(FreeManagedFn, {RawManagedMem}); + } +} + +// Replace all uses of `Old` with `New`, even inside `ConstantExpr`. +// +// `replaceAllUsesWith` does replace values in `ConstantExpr`. This function +// actually does replace it in `ConstantExpr`. The caveat is that if there is +// a use that is *outside* a function (say, at global declarations), we fail. +// So, this is meant to be used on values which we know will only be used +// within functions. +// +// This process works by looking through the uses of `Old`. If it finds a +// `ConstantExpr`, it recursively looks for the owning instruction. +// Then, it expands all the `ConstantExpr` to instructions and replaces +// `Old` with `New` in the expanded instructions. +static void replaceAllUsesAndConstantUses(Value *Old, Value *New, + PollyIRBuilder &Builder) { + SmallVector UserInstructions; + // Get all instructions that use array. We need to do this weird thing + // because `Constant`s that contain this array neeed to be expanded into + // instructions so that we can replace their parameters. `Constant`s cannot + // be edited easily, so we choose to convert all `Constant`s to + // `Instruction`s and handle all of the uses of `Array` uniformly. 
+ for (Use &ArrayUse : Old->uses()) + getInstructionUsersOfValue(ArrayUse.getUser(), UserInstructions); + + for (Instruction *I : UserInstructions) + rewriteOldValToNew(I, Old, New, Builder); +} + +class ManagedMemoryRewritePass final : public ModulePass { +public: + static char ID; + GPUArch Architecture; + GPURuntime Runtime; + + ManagedMemoryRewritePass() : ModulePass(ID) {} + bool runOnModule(Module &M) override { + const DataLayout &DL = M.getDataLayout(); + + Function *Malloc = M.getFunction("malloc"); + + if (Malloc) { + PollyIRBuilder Builder(M.getContext()); + Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M); + assert(PollyMallocManaged && "unable to create polly_mallocManaged"); + + replaceAllUsesAndConstantUses(Malloc, PollyMallocManaged, Builder); + Malloc->eraseFromParent(); + } + + Function *Free = M.getFunction("free"); + + if (Free) { + PollyIRBuilder Builder(M.getContext()); + Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M); + assert(PollyFreeManaged && "unable to create polly_freeManaged"); + + replaceAllUsesAndConstantUses(Free, PollyFreeManaged, Builder); + Free->eraseFromParent(); + } + + SmallPtrSet GlobalsToErase; + for (GlobalVariable &Global : M.globals()) + replaceGlobalArray(M, DL, Global, GlobalsToErase); + for (GlobalVariable *G : GlobalsToErase) + G->eraseFromParent(); + + // Rewrite allocas to cudaMallocs if we are asked to do so. + if (RewriteAllocas) { + SmallSet AllocasToBeManaged; + for (Function &F : M.functions()) + getAllocasToBeManaged(F, AllocasToBeManaged); + + for (AllocaInst *Alloca : AllocasToBeManaged) + rewriteAllocaAsManagedMemory(Alloca, DL); + } + + return true; + } +}; +} // namespace +char ManagedMemoryRewritePass::ID = 42; + +Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch, + GPURuntime Runtime) { + ManagedMemoryRewritePass *pass = new ManagedMemoryRewritePass(); + pass->Runtime = Runtime; + pass->Architecture = Arch; + return pass; +} + +INITIALIZE_PASS_BEGIN( + ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory", + "Polly - Rewrite all allocations in heap & data section to managed memory", + false, false) +INITIALIZE_PASS_DEPENDENCY(PPCGCodeGeneration); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_END( + ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory", + "Polly - Rewrite all allocations in heap & data section to managed memory", + false, false) diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp new file mode 100644 --- /dev/null +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -0,0 +1,3657 @@ +//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Take a scop created by ScopInfo and map it to GPU code using the ppcg +// GPU mapping strategy. 
+// +//===----------------------------------------------------------------------===// + +#include "polly/CodeGen/PPCGCodeGeneration.h" +#include "polly/CodeGen/CodeGeneration.h" +#include "polly/CodeGen/IslAst.h" +#include "polly/CodeGen/IslNodeBuilder.h" +#include "polly/CodeGen/PerfMonitor.h" +#include "polly/CodeGen/Utils.h" +#include "polly/DependenceInfo.h" +#include "polly/LinkAllPasses.h" +#include "polly/Options.h" +#include "polly/ScopDetection.h" +#include "polly/ScopInfo.h" +#include "polly/Support/ISLTools.h" +#include "polly/Support/SCEVValidator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/InitializePasses.h" +#include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "isl/union_map.h" +#include + +extern "C" { +#include "ppcg/cuda.h" +#include "ppcg/gpu.h" +#include "ppcg/ppcg.h" +} + +#include "llvm/Support/Debug.h" + +using namespace polly; +using namespace llvm; + +#define DEBUG_TYPE "polly-codegen-ppcg" + +static cl::opt DumpSchedule("polly-acc-dump-schedule", + cl::desc("Dump the computed GPU Schedule"), + cl::Hidden, cl::cat(PollyCategory)); + +static cl::opt + DumpCode("polly-acc-dump-code", + cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, + cl::cat(PollyCategory)); + +static cl::opt DumpKernelIR("polly-acc-dump-kernel-ir", + cl::desc("Dump the kernel LLVM-IR"), + cl::Hidden, cl::cat(PollyCategory)); + +static cl::opt DumpKernelASM("polly-acc-dump-kernel-asm", + cl::desc("Dump the kernel assembly code"), + cl::Hidden, cl::cat(PollyCategory)); + +static cl::opt FastMath("polly-acc-fastmath", + cl::desc("Allow unsafe math optimizations"), + cl::Hidden, cl::cat(PollyCategory)); +static cl::opt SharedMemory("polly-acc-use-shared", + cl::desc("Use shared memory"), cl::Hidden, + cl::cat(PollyCategory)); +static cl::opt PrivateMemory("polly-acc-use-private", + cl::desc("Use private memory"), cl::Hidden, + cl::cat(PollyCategory)); + +bool polly::PollyManagedMemory; +static cl::opt + XManagedMemory("polly-acc-codegen-managed-memory", + cl::desc("Generate Host kernel code assuming" + " that all memory has been" + " declared as managed memory"), + cl::location(PollyManagedMemory), cl::Hidden, + cl::init(false), cl::cat(PollyCategory)); + +static cl::opt + FailOnVerifyModuleFailure("polly-acc-fail-on-verify-module-failure", + cl::desc("Fail and generate a backtrace if" + " verifyModule fails on the GPU " + " kernel module."), + cl::Hidden, cl::cat(PollyCategory)); + +static cl::opt CUDALibDevice( + "polly-acc-libdevice", cl::desc("Path to CUDA libdevice"), cl::Hidden, + cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.ll"), + cl::cat(PollyCategory)); + +static cl::opt + CudaVersion("polly-acc-cuda-version", + cl::desc("The CUDA version to compile for"), cl::Hidden, + cl::init("sm_30"), cl::cat(PollyCategory)); + +static cl::opt + MinCompute("polly-acc-mincompute", + cl::desc("Minimal number of compute statements to run on GPU."), + cl::Hidden, cl::init(10 * 512 * 512)); + +GPURuntime polly::GPURuntimeChoice; +static cl::opt + XGPURuntimeChoice("polly-gpu-runtime", + cl::desc("The GPU Runtime API to target"), + 
cl::values(clEnumValN(GPURuntime::CUDA, "libcudart", + "use the CUDA Runtime API"), + clEnumValN(GPURuntime::OpenCL, "libopencl", + "use the OpenCL Runtime API")), + cl::location(polly::GPURuntimeChoice), + cl::init(GPURuntime::CUDA), cl::cat(PollyCategory)); + +GPUArch polly::GPUArchChoice; +static cl::opt + XGPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"), + cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64", + "target NVIDIA 64-bit architecture"), + clEnumValN(GPUArch::SPIR32, "spir32", + "target SPIR 32-bit architecture"), + clEnumValN(GPUArch::SPIR64, "spir64", + "target SPIR 64-bit architecture")), + cl::location(polly::GPUArchChoice), + cl::init(GPUArch::NVPTX64), cl::cat(PollyCategory)); + +extern bool polly::PerfMonitoring; + +/// Return a unique name for a Scop, which is the scop region with the +/// function name. +std::string getUniqueScopName(const Scop *S) { + return "Scop Region: " + S->getNameStr() + + " | Function: " + std::string(S->getFunction().getName()); +} + +/// Used to store information PPCG wants for kills. This information is +/// used by live range reordering. +/// +/// @see computeLiveRangeReordering +/// @see GPUNodeBuilder::createPPCGScop +/// @see GPUNodeBuilder::createPPCGProg +struct MustKillsInfo { + /// Collection of all kill statements that will be sequenced at the end of + /// PPCGScop->schedule. + /// + /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set` + /// which merges schedules in *arbitrary* order. + /// (we don't care about the order of the kills anyway). + isl::schedule KillsSchedule; + /// Map from kill statement instances to scalars that need to be + /// killed. + /// + /// We currently derive kill information for: + /// 1. phi nodes. PHI nodes are not alive outside the scop and can + /// consequently all be killed. + /// 2. Scalar arrays that are not used outside the Scop. This is + /// checked by `isScalarUsesContainedInScop`. + /// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } + isl::union_map TaggedMustKills; + + /// Tagged must kills stripped of the tags. + /// [params] -> { Stmt_phantom[] -> scalar_to_kill[] } + isl::union_map MustKills; + + MustKillsInfo() : KillsSchedule() {} +}; + +/// Check if SAI's uses are entirely contained within Scop S. +/// If a scalar is used only with a Scop, we are free to kill it, as no data +/// can flow in/out of the value any more. +/// @see computeMustKillsInfo +static bool isScalarUsesContainedInScop(const Scop &S, + const ScopArrayInfo *SAI) { + assert(SAI->isValueKind() && "this function only deals with scalars." + " Dealing with arrays required alias analysis"); + + const Region &R = S.getRegion(); + for (User *U : SAI->getBasePtr()->users()) { + Instruction *I = dyn_cast(U); + assert(I && "invalid user of scop array info"); + if (!R.contains(I)) + return false; + } + return true; +} + +/// Compute must-kills needed to enable live range reordering with PPCG. +/// +/// @params S The Scop to compute live range reordering information +/// @returns live range reordering information that can be used to setup +/// PPCG. +static MustKillsInfo computeMustKillsInfo(const Scop &S) { + const isl::space ParamSpace = S.getParamSpace(); + MustKillsInfo Info; + + // 1. Collect all ScopArrayInfo that satisfy *any* of the criteria: + // 1.1 phi nodes in scop. 
+ // 1.2 scalars that are only used within the scop + SmallVector KillMemIds; + for (ScopArrayInfo *SAI : S.arrays()) { + if (SAI->isPHIKind() || + (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI))) + KillMemIds.push_back(isl::manage(SAI->getBasePtrId().release())); + } + + Info.TaggedMustKills = isl::union_map::empty(ParamSpace.ctx()); + Info.MustKills = isl::union_map::empty(ParamSpace.ctx()); + + // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the + // schedule: + // - filter: "[control] -> { }" + // So, we choose to not create this to keep the output a little nicer, + // at the cost of some code complexity. + Info.KillsSchedule = {}; + + for (isl::id &ToKillId : KillMemIds) { + isl::id KillStmtId = isl::id::alloc( + S.getIslCtx(), + std::string("SKill_phantom_").append(ToKillId.get_name()), nullptr); + + // NOTE: construction of tagged_must_kill: + // 2. We need to construct a map: + // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } + // To construct this, we use `isl_map_domain_product` on 2 maps`: + // 2a. StmtToScalar: + // [param] -> { Stmt_phantom[] -> scalar_to_kill[] } + // 2b. PhantomRefToScalar: + // [param] -> { ref_phantom[] -> scalar_to_kill[] } + // + // Combining these with `isl_map_domain_product` gives us + // TaggedMustKill: + // [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } + + // 2a. [param] -> { Stmt[] -> scalar_to_kill[] } + isl::map StmtToScalar = isl::map::universe(ParamSpace); + StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::in, isl::id(KillStmtId)); + StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::out, isl::id(ToKillId)); + + isl::id PhantomRefId = isl::id::alloc( + S.getIslCtx(), std::string("ref_phantom") + ToKillId.get_name(), + nullptr); + + // 2b. [param] -> { phantom_ref[] -> scalar_to_kill[] } + isl::map PhantomRefToScalar = isl::map::universe(ParamSpace); + PhantomRefToScalar = + PhantomRefToScalar.set_tuple_id(isl::dim::in, PhantomRefId); + PhantomRefToScalar = + PhantomRefToScalar.set_tuple_id(isl::dim::out, ToKillId); + + // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } + isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar); + Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill); + + // 2. [param] -> { Stmt[] -> scalar_to_kill[] } + Info.MustKills = Info.TaggedMustKills.domain_factor_domain(); + + // 3. Create the kill schedule of the form: + // "[param] -> { Stmt_phantom[] }" + // Then add this to Info.KillsSchedule. + isl::space KillStmtSpace = ParamSpace; + KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId); + isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace); + + isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain); + if (!Info.KillsSchedule.is_null()) + Info.KillsSchedule = isl::manage( + isl_schedule_set(Info.KillsSchedule.release(), KillSchedule.copy())); + else + Info.KillsSchedule = KillSchedule; + } + + return Info; +} + +/// Create the ast expressions for a ScopStmt. +/// +/// This function is a callback for to generate the ast expressions for each +/// of the scheduled ScopStmts. 
+static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( + void *StmtT, __isl_take isl_ast_build *Build_C, + isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, + isl_id *Id, void *User), + void *UserIndex, + isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), + void *UserExpr) { + + ScopStmt *Stmt = (ScopStmt *)StmtT; + + if (!Stmt || !Build_C) + return NULL; + + isl::ast_build Build = isl::manage_copy(Build_C); + isl::ctx Ctx = Build.ctx(); + isl::id_to_ast_expr RefToExpr = isl::id_to_ast_expr::alloc(Ctx, 0); + + Stmt->setAstBuild(Build); + + for (MemoryAccess *Acc : *Stmt) { + isl::map AddrFunc = Acc->getAddressFunction(); + AddrFunc = AddrFunc.intersect_domain(Stmt->getDomain()); + + isl::id RefId = Acc->getId(); + isl::pw_multi_aff PMA = isl::pw_multi_aff::from_map(AddrFunc); + + isl::multi_pw_aff MPA = isl::multi_pw_aff(PMA); + MPA = MPA.coalesce(); + MPA = isl::manage(FunctionIndex(MPA.release(), RefId.get(), UserIndex)); + + isl::ast_expr Access = Build.access_from(MPA); + Access = isl::manage(FunctionExpr(Access.release(), RefId.get(), UserExpr)); + RefToExpr = RefToExpr.set(RefId, Access); + } + + return RefToExpr.release(); +} + +/// Given a LLVM Type, compute its size in bytes, +static int computeSizeInBytes(const Type *T) { + int bytes = T->getPrimitiveSizeInBits() / 8; + if (bytes == 0) + bytes = T->getScalarSizeInBits() / 8; + return bytes; +} + +/// Generate code for a GPU specific isl AST. +/// +/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which +/// generates code for general-purpose AST nodes, with special functionality +/// for generating GPU specific user nodes. +/// +/// @see GPUNodeBuilder::createUser +class GPUNodeBuilder final : public IslNodeBuilder { +public: + GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, + const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, + DominatorTree &DT, Scop &S, BasicBlock *StartBlock, + gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) + : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), + Prog(Prog), Runtime(Runtime), Arch(Arch) { + getExprBuilder().setIDToSAI(&IDToSAI); + } + + /// Create after-run-time-check initialization code. + void initializeAfterRTH(); + + /// Finalize the generated scop. + void finalize() override; + + /// Track if the full build process was successful. + /// + /// This value is set to false, if throughout the build process an error + /// occurred which prevents us from generating valid GPU code. + bool BuildSuccessful = true; + + /// The maximal number of loops surrounding a sequential kernel. + unsigned DeepestSequential = 0; + + /// The maximal number of loops surrounding a parallel kernel. + unsigned DeepestParallel = 0; + + /// Return the name to set for the ptx_kernel. + std::string getKernelFuncName(int Kernel_id); + +private: + /// A vector of array base pointers for which a new ScopArrayInfo was created. + /// + /// This vector is used to delete the ScopArrayInfo when it is not needed any + /// more. + std::vector LocalArrays; + + /// A map from ScopArrays to their corresponding device allocations. + std::map DeviceAllocations; + + /// The current GPU context. + Value *GPUContext; + + /// The set of isl_ids allocated in the kernel + std::vector KernelIds; + + /// A module containing GPU code. + /// + /// This pointer is only set in case we are currently generating GPU code. + std::unique_ptr GPUModule; + + /// The GPU program we generate code for. 
+ gpu_prog *Prog; + + /// The GPU Runtime implementation to use (OpenCL or CUDA). + GPURuntime Runtime; + + /// The GPU Architecture to target. + GPUArch Arch; + + /// Class to free isl_ids. + class IslIdDeleter final { + public: + void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; + }; + + /// A set containing all isl_ids allocated in a GPU kernel. + /// + /// By releasing this set all isl_ids will be freed. + std::set> KernelIDs; + + IslExprBuilder::IDToScopArrayInfoTy IDToSAI; + + /// Create code for user-defined AST nodes. + /// + /// These AST nodes can be of type: + /// + /// - ScopStmt: A computational statement (TODO) + /// - Kernel: A GPU kernel call (TODO) + /// - Data-Transfer: A GPU <-> CPU data-transfer + /// - In-kernel synchronization + /// - In-kernel memory copy statement + /// + /// @param UserStmt The ast node to generate code for. + void createUser(__isl_take isl_ast_node *UserStmt) override; + + void createFor(__isl_take isl_ast_node *Node) override; + + enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST }; + + /// Create code for a data transfer statement + /// + /// @param TransferStmt The data transfer statement. + /// @param Direction The direction in which to transfer data. + void createDataTransfer(__isl_take isl_ast_node *TransferStmt, + enum DataDirection Direction); + + /// Find llvm::Values referenced in GPU kernel. + /// + /// @param Kernel The kernel to scan for llvm::Values + /// + /// @returns A tuple, whose: + /// - First element contains the set of values referenced by the + /// kernel + /// - Second element contains the set of functions referenced by the + /// kernel. All functions in the set satisfy + /// `isValidFunctionInKernel`. + /// - Third element contains loops that have induction variables + /// which are used in the kernel, *and* these loops are *neither* + /// in the scop, nor do they immediately surroung the Scop. + /// See [Code generation of induction variables of loops outside + /// Scops] + std::tuple, SetVector, SetVector, + isl::space> + getReferencesInKernel(ppcg_kernel *Kernel); + + /// Compute the sizes of the execution grid for a given kernel. + /// + /// @param Kernel The kernel to compute grid sizes for. + /// + /// @returns A tuple with grid sizes for X and Y dimension + std::tuple getGridSizes(ppcg_kernel *Kernel); + + /// Get the managed array pointer for sending host pointers to the device. + /// \note + /// This is to be used only with managed memory + Value *getManagedDeviceArray(gpu_array_info *Array, ScopArrayInfo *ArrayInfo); + + /// Compute the sizes of the thread blocks for a given kernel. + /// + /// @param Kernel The kernel to compute thread block sizes for. + /// + /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. + std::tuple getBlockSizes(ppcg_kernel *Kernel); + + /// Store a specific kernel launch parameter in the array of kernel launch + /// parameters. + /// + /// @param ArrayTy Array type of \p Parameters. + /// @param Parameters The list of parameters in which to store. + /// @param Param The kernel launch parameter to store. + /// @param Index The index in the parameter list, at which to store the + /// parameter. + void insertStoreParameter(Type *ArrayTy, Instruction *Parameters, + Instruction *Param, int Index); + + /// Create kernel launch parameters. + /// + /// @param Kernel The kernel to create parameters for. + /// @param F The kernel function that has been created. + /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 
+ /// + /// @returns A stack allocated array with pointers to the parameter + /// values that are passed to the kernel. + Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, + SetVector SubtreeValues); + + /// Create declarations for kernel variable. + /// + /// This includes shared memory declarations. + /// + /// @param Kernel The kernel definition to create variables for. + /// @param FN The function into which to generate the variables. + void createKernelVariables(ppcg_kernel *Kernel, Function *FN); + + /// Add CUDA annotations to module. + /// + /// Add a set of CUDA annotations that declares the maximal block dimensions + /// that will be used to execute the CUDA kernel. This allows the NVIDIA + /// PTX compiler to bound the number of allocated registers to ensure the + /// resulting kernel is known to run with up to as many block dimensions + /// as specified here. + /// + /// @param M The module to add the annotations to. + /// @param BlockDimX The size of block dimension X. + /// @param BlockDimY The size of block dimension Y. + /// @param BlockDimZ The size of block dimension Z. + void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, + Value *BlockDimZ); + + /// Create GPU kernel. + /// + /// Code generate the kernel described by @p KernelStmt. + /// + /// @param KernelStmt The ast node to generate kernel code for. + void createKernel(__isl_take isl_ast_node *KernelStmt); + + /// Generate code that computes the size of an array. + /// + /// @param Array The array for which to compute a size. + Value *getArraySize(gpu_array_info *Array); + + /// Generate code to compute the minimal offset at which an array is accessed. + /// + /// The offset of an array is the minimal array location accessed in a scop. + /// + /// Example: + /// + /// for (long i = 0; i < 100; i++) + /// A[i + 42] += ... + /// + /// getArrayOffset(A) results in 42. + /// + /// @param Array The array for which to compute the offset. + /// @returns An llvm::Value that contains the offset of the array. + Value *getArrayOffset(gpu_array_info *Array); + + /// Prepare the kernel arguments for kernel code generation + /// + /// @param Kernel The kernel to generate code for. + /// @param FN The function created for the kernel. + void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN); + + /// Create kernel function. + /// + /// Create a kernel function located in a newly created module that can serve + /// as target for device code generation. Set the Builder to point to the + /// start block of this newly created function. + /// + /// @param Kernel The kernel to generate code for. + /// @param SubtreeValues The set of llvm::Values referenced by this kernel. + /// @param SubtreeFunctions The set of llvm::Functions referenced by this + /// kernel. + void createKernelFunction(ppcg_kernel *Kernel, + SetVector &SubtreeValues, + SetVector &SubtreeFunctions); + + /// Create the declaration of a kernel function. + /// + /// The kernel function takes as arguments: + /// + /// - One i8 pointer for each external array reference used in the kernel. + /// - Host iterators + /// - Parameters + /// - Other LLVM Value references (TODO) + /// + /// @param Kernel The kernel to generate the function declaration for. + /// @param SubtreeValues The set of llvm::Values referenced by this kernel. + /// + /// @returns The newly declared function. + Function *createKernelFunctionDecl(ppcg_kernel *Kernel, + SetVector &SubtreeValues); + + /// Insert intrinsic functions to obtain thread and block ids. 
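+  /// (For the NVPTX path these are the NVVM special-register intrinsics,
+  /// e.g. llvm.nvvm.read.ptx.sreg.tid.{x,y,z} and
+  /// llvm.nvvm.read.ptx.sreg.ctaid.{x,y,z}.)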
+  ///
+  /// @param Kernel The kernel to generate the intrinsic functions for.
+  void insertKernelIntrinsics(ppcg_kernel *Kernel);
+
+  /// Insert function calls to retrieve the SPIR group/local ids.
+  ///
+  /// @param Kernel The kernel to generate the function calls for.
+  /// @param SizeTypeIs64bit Whether size_t of the OpenCL device is 64-bit.
+  void insertKernelCallsSPIR(ppcg_kernel *Kernel, bool SizeTypeIs64bit);
+
+  /// Setup the creation of functions referenced by the GPU kernel.
+  ///
+  /// 1. Create new function declarations in GPUModule which are the same as
+  /// SubtreeFunctions.
+  ///
+  /// 2. Populate IslNodeBuilder::ValueMap with mappings from
+  /// old functions (that come from the original module) to new functions
+  /// (that are created within GPUModule). That way, we generate references
+  /// to the correct function (in GPUModule) in BlockGenerator.
+  ///
+  /// @see IslNodeBuilder::ValueMap
+  /// @see BlockGenerator::GlobalMap
+  /// @see BlockGenerator::getNewValue
+  /// @see GPUNodeBuilder::getReferencesInKernel.
+  ///
+  /// @param SubtreeFunctions The set of llvm::Functions referenced by
+  /// this kernel.
+  void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
+
+  /// Create a global-to-shared or shared-to-global copy statement.
+  ///
+  /// @param CopyStmt The copy statement to generate code for.
+  void createKernelCopy(ppcg_kernel_stmt *CopyStmt);
+
+  /// Create code for a ScopStmt called in @p Expr.
+  ///
+  /// @param Expr The expression containing the call.
+  /// @param KernelStmt The kernel statement referenced in the call.
+  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);
+
+  /// Create an in-kernel synchronization call.
+  void createKernelSync();
+
+  /// Create a PTX assembly string for the current GPU kernel.
+  ///
+  /// @returns A string containing the corresponding PTX assembly code.
+  std::string createKernelASM();
+
+  /// Remove references from the dominator tree to the kernel function @p F.
+  ///
+  /// @param F The function to remove references to.
+  void clearDominators(Function *F);
+
+  /// Remove references from scalar evolution to the kernel function @p F.
+  ///
+  /// @param F The function to remove references to.
+  void clearScalarEvolution(Function *F);
+
+  /// Remove references from loop info to the kernel function @p F.
+  ///
+  /// @param F The function to remove references to.
+  void clearLoops(Function *F);
+
+  /// Check if the scop requires linking with CUDA's libdevice.
+  bool requiresCUDALibDevice();
+
+  /// Link with the NVIDIA libdevice library (if needed and available).
+  void addCUDALibDevice();
+
+  /// Finalize the generation of the kernel function.
+  ///
+  /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
+  /// dump its IR to stderr.
+  ///
+  /// @returns The assembly string of the kernel.
+  std::string finalizeKernelFunction();
+
+  /// Finalize the generation of the kernel arguments.
+  ///
+  /// This function ensures that not-read-only scalars used in a kernel are
+  /// stored back to the global memory location they are backed with before
+  /// the kernel terminates.
+  ///
+  /// @param Kernel The kernel to finalize kernel arguments for.
+  void finalizeKernelArguments(ppcg_kernel *Kernel);
+
+  /// Create code that allocates memory to store arrays on device.
+  void allocateDeviceArrays();
+
+  /// Create code to prepare the managed device pointers.
+  void prepareManagedDeviceArrays();
+
+  /// Free all allocated device arrays.
+  void freeDeviceArrays();
+
+  /// Create a call to initialize the GPU context.
+  ///
+  /// @returns A pointer to the newly initialized context.
+  Value *createCallInitContext();
+
+  /// Create a call to get the device pointer for a kernel allocation.
+  ///
+  /// @param Allocation The Polly GPU allocation.
+  ///
+  /// @returns The device parameter corresponding to this allocation.
+  Value *createCallGetDevicePtr(Value *Allocation);
+
+  /// Create a call to free the GPU context.
+  ///
+  /// @param Context A pointer to an initialized GPU context.
+  void createCallFreeContext(Value *Context);
+
+  /// Create a call to allocate memory on the device.
+  ///
+  /// @param Size The size of memory to allocate.
+  ///
+  /// @returns A pointer that identifies this allocation.
+  Value *createCallAllocateMemoryForDevice(Value *Size);
+
+  /// Create a call to free a device array.
+  ///
+  /// @param Array The device array to free.
+  void createCallFreeDeviceMemory(Value *Array);
+
+  /// Create a call to copy data from host to device.
+  ///
+  /// @param HostPtr A pointer to the host data that should be copied.
+  /// @param DevicePtr A device pointer specifying the location to copy to.
+  void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
+                                      Value *Size);
+
+  /// Create a call to copy data from device to host.
+  ///
+  /// @param DevicePtr A pointer to the device data that should be copied.
+  /// @param HostPtr A host pointer specifying the location to copy to.
+  void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
+                                      Value *Size);
+
+  /// Create a call to synchronize Host & Device.
+  /// \note
+  /// This is to be used only with managed memory.
+  void createCallSynchronizeDevice();
+
+  /// Create a call to get a kernel from an assembly string.
+  ///
+  /// @param Buffer The string describing the kernel.
+  /// @param Entry The name of the kernel function to call.
+  ///
+  /// @returns A pointer to a kernel object.
+  Value *createCallGetKernel(Value *Buffer, Value *Entry);
+
+  /// Create a call to free a GPU kernel.
+  ///
+  /// @param GPUKernel The kernel to free.
+  void createCallFreeKernel(Value *GPUKernel);
+
+  /// Create a call to launch a GPU kernel.
+  ///
+  /// @param GPUKernel The kernel to launch.
+  /// @param GridDimX The size of the first grid dimension.
+  /// @param GridDimY The size of the second grid dimension.
+  /// @param BlockDimX The size of the first block dimension.
+  /// @param BlockDimY The size of the second block dimension.
+  /// @param BlockDimZ The size of the third block dimension.
+  /// @param Parameters A pointer to an array that itself contains pointers to
+  /// the parameter values passed for each kernel argument.
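+  ///
+  /// As a rough sketch (the names below are illustrative, not the generated
+  /// ones), the host-side sequence emitted around a kernel launch looks like:
+  ///
+  ///   Kernel = polly_getKernel(ASMString, KernelName);
+  ///   polly_launchKernel(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
+  ///                      BlockDimZ, Parameters);
+  ///   polly_freeKernel(Kernel);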
+ void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, + Value *GridDimY, Value *BlockDimX, + Value *BlockDimY, Value *BlockDimZ, + Value *Parameters); +}; + +std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) { + return "FUNC_" + S.getFunction().getName().str() + "_SCOP_" + + std::to_string(S.getID()) + "_KERNEL_" + std::to_string(Kernel_id); +} + +void GPUNodeBuilder::initializeAfterRTH() { + BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(), + &*Builder.GetInsertPoint(), &DT, &LI); + NewBB->setName("polly.acc.initialize"); + Builder.SetInsertPoint(&NewBB->front()); + + GPUContext = createCallInitContext(); + + if (!PollyManagedMemory) + allocateDeviceArrays(); + else + prepareManagedDeviceArrays(); +} + +void GPUNodeBuilder::finalize() { + if (!PollyManagedMemory) + freeDeviceArrays(); + + createCallFreeContext(GPUContext); + IslNodeBuilder::finalize(); +} + +void GPUNodeBuilder::allocateDeviceArrays() { + assert(!PollyManagedMemory && + "Managed memory will directly send host pointers " + "to the kernel. There is no need for device arrays"); + isl_ast_build *Build = isl_ast_build_from_context(S.getContext().release()); + + for (int i = 0; i < Prog->n_array; ++i) { + gpu_array_info *Array = &Prog->array[i]; + auto *ScopArray = (ScopArrayInfo *)Array->user; + std::string DevArrayName("p_dev_array_"); + DevArrayName.append(Array->name); + + Value *ArraySize = getArraySize(Array); + Value *Offset = getArrayOffset(Array); + if (Offset) + ArraySize = Builder.CreateSub( + ArraySize, + Builder.CreateMul(Offset, + Builder.getInt64(ScopArray->getElemSizeInBytes()))); + const SCEV *SizeSCEV = SE.getSCEV(ArraySize); + // It makes no sense to have an array of size 0. The CUDA API will + // throw an error anyway if we invoke `cuMallocManaged` with size `0`. We + // choose to be defensive and catch this at the compile phase. It is + // most likely that we are doing something wrong with size computation. + if (SizeSCEV->isZero()) { + errs() << getUniqueScopName(&S) + << " has computed array size 0: " << *ArraySize + << " | for array: " << *(ScopArray->getBasePtr()) + << ". 
This is illegal, exiting.\n"; + report_fatal_error("array size was computed to be 0"); + } + + Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); + DevArray->setName(DevArrayName); + DeviceAllocations[ScopArray] = DevArray; + } + + isl_ast_build_free(Build); +} + +void GPUNodeBuilder::prepareManagedDeviceArrays() { + assert(PollyManagedMemory && + "Device array most only be prepared in managed-memory mode"); + for (int i = 0; i < Prog->n_array; ++i) { + gpu_array_info *Array = &Prog->array[i]; + ScopArrayInfo *ScopArray = (ScopArrayInfo *)Array->user; + Value *HostPtr; + + if (gpu_array_is_scalar(Array)) + HostPtr = BlockGen.getOrCreateAlloca(ScopArray); + else + HostPtr = ScopArray->getBasePtr(); + HostPtr = getLatestValue(HostPtr); + + Value *Offset = getArrayOffset(Array); + if (Offset) { + HostPtr = Builder.CreatePointerCast( + HostPtr, ScopArray->getElementType()->getPointerTo()); + HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset); + } + + HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); + DeviceAllocations[ScopArray] = HostPtr; + } +} + +void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX, + Value *BlockDimY, Value *BlockDimZ) { + auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations"); + + for (auto &F : *M) { + if (F.getCallingConv() != CallingConv::PTX_Kernel) + continue; + + Value *V[] = {BlockDimX, BlockDimY, BlockDimZ}; + + Metadata *Elements[] = { + ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"), + ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"), + ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"), + ValueAsMetadata::get(V[2]), + }; + MDNode *Node = MDNode::get(M->getContext(), Elements); + AnnotationNode->addOperand(Node); + } +} + +void GPUNodeBuilder::freeDeviceArrays() { + assert(!PollyManagedMemory && "Managed memory does not use device arrays"); + for (auto &Array : DeviceAllocations) + createCallFreeDeviceMemory(Array.second); +} + +Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { + const char *Name = "polly_getKernel"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + return Builder.CreateCall(F, {Buffer, Entry}); +} + +Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { + const char *Name = "polly_getDevicePtr"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. 
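+  // The declaration created below corresponds to a GPU runtime entry point
+  // with (roughly) the C signature `void *polly_getDevicePtr(void *Allocation)`.
+  // This is only a sketch for readers; the authoritative signature is the one
+  // provided by Polly's GPU runtime library.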
+ if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + return Builder.CreateCall(F, {Allocation}); +} + +void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, + Value *GridDimY, Value *BlockDimX, + Value *BlockDimY, Value *BlockDimZ, + Value *Parameters) { + const char *Name = "polly_launchKernel"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, + BlockDimZ, Parameters}); +} + +void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { + const char *Name = "polly_freeKernel"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall(F, {GPUKernel}); +} + +void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { + assert(!PollyManagedMemory && + "Managed memory does not allocate or free memory " + "for device"); + const char *Name = "polly_freeDeviceMemory"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall(F, {Array}); +} + +Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { + assert(!PollyManagedMemory && + "Managed memory does not allocate or free memory " + "for device"); + const char *Name = "polly_allocateMemoryForDevice"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. 
+ if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + Args.push_back(Builder.getInt64Ty()); + FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + return Builder.CreateCall(F, {Size}); +} + +void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, + Value *DeviceData, + Value *Size) { + assert(!PollyManagedMemory && + "Managed memory does not transfer memory between " + "device and host"); + const char *Name = "polly_copyFromHostToDevice"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt64Ty()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall(F, {HostData, DeviceData, Size}); +} + +void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, + Value *HostData, + Value *Size) { + assert(!PollyManagedMemory && + "Managed memory does not transfer memory between " + "device and host"); + const char *Name = "polly_copyFromDeviceToHost"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt64Ty()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall(F, {DeviceData, HostData, Size}); +} + +void GPUNodeBuilder::createCallSynchronizeDevice() { + assert(PollyManagedMemory && "explicit synchronization is only necessary for " + "managed memory"); + const char *Name = "polly_synchronizeDevice"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall(F); +} + +Value *GPUNodeBuilder::createCallInitContext() { + const char *Name; + + switch (Runtime) { + case GPURuntime::CUDA: + Name = "polly_initContextCUDA"; + break; + case GPURuntime::OpenCL: + Name = "polly_initContextCL"; + break; + } + + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + return Builder.CreateCall(F, {}); +} + +void GPUNodeBuilder::createCallFreeContext(Value *Context) { + const char *Name = "polly_freeContext"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. 
+ if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall(F, {Context}); +} + +/// Check if one string is a prefix of another. +/// +/// @param String The string in which to look for the prefix. +/// @param Prefix The prefix to look for. +static bool isPrefix(std::string String, std::string Prefix) { + return String.find(Prefix) == 0; +} + +Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { + isl::ast_build Build = isl::ast_build::from_context(S.getContext()); + Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); + + if (!gpu_array_is_scalar(Array)) { + isl::multi_pw_aff ArrayBound = isl::manage_copy(Array->bound); + + isl::pw_aff OffsetDimZero = ArrayBound.at(0); + isl::ast_expr Res = Build.expr_from(OffsetDimZero); + + for (unsigned int i = 1; i < Array->n_index; i++) { + isl::pw_aff Bound_I = ArrayBound.at(i); + isl::ast_expr Expr = Build.expr_from(Bound_I); + Res = Res.mul(Expr); + } + + Value *NumElements = ExprBuilder.create(Res.release()); + if (NumElements->getType() != ArraySize->getType()) + NumElements = Builder.CreateSExt(NumElements, ArraySize->getType()); + ArraySize = Builder.CreateMul(ArraySize, NumElements); + } + return ArraySize; +} + +Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { + if (gpu_array_is_scalar(Array)) + return nullptr; + + isl::ast_build Build = isl::ast_build::from_context(S.getContext()); + + isl::set Min = isl::manage_copy(Array->extent).lexmin(); + + isl::set ZeroSet = isl::set::universe(Min.get_space()); + + for (unsigned i : rangeIslSize(0, Min.tuple_dim())) + ZeroSet = ZeroSet.fix_si(isl::dim::set, i, 0); + + if (Min.is_subset(ZeroSet)) { + return nullptr; + } + + isl::ast_expr Result = isl::ast_expr::from_val(isl::val(Min.ctx(), 0)); + + for (unsigned i : rangeIslSize(0, Min.tuple_dim())) { + if (i > 0) { + isl::pw_aff Bound_I = + isl::manage(isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1)); + isl::ast_expr BExpr = Build.expr_from(Bound_I); + Result = Result.mul(BExpr); + } + isl::pw_aff DimMin = Min.dim_min(i); + isl::ast_expr MExpr = Build.expr_from(DimMin); + Result = Result.add(MExpr); + } + + return ExprBuilder.create(Result.release()); +} + +Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info *Array, + ScopArrayInfo *ArrayInfo) { + assert(PollyManagedMemory && "Only used when you wish to get a host " + "pointer for sending data to the kernel, " + "with managed memory"); + std::map::iterator it; + it = DeviceAllocations.find(ArrayInfo); + assert(it != DeviceAllocations.end() && + "Device array expected to be available"); + return it->second; +} + +void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, + enum DataDirection Direction) { + assert(!PollyManagedMemory && "Managed memory needs no data transfers"); + isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); + isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); + isl_id *Id = isl_ast_expr_get_id(Arg); + auto Array = (gpu_array_info *)isl_id_get_user(Id); + auto ScopArray = (ScopArrayInfo *)(Array->user); + + Value *Size = getArraySize(Array); + Value *Offset = getArrayOffset(Array); + Value *DevPtr = DeviceAllocations[ScopArray]; + + Value *HostPtr; + + if (gpu_array_is_scalar(Array)) + HostPtr = BlockGen.getOrCreateAlloca(ScopArray); + else + HostPtr = ScopArray->getBasePtr(); 
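+
+  // If only a suffix of the array is accessed (see getArrayOffset(), e.g. for
+  // accesses of the form A[i + 42]), the host pointer is advanced by that
+  // offset below and the number of copied bytes is reduced accordingly, so
+  // only the accessed region is transferred.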
+ HostPtr = getLatestValue(HostPtr); + + if (Offset) { + HostPtr = Builder.CreatePointerCast( + HostPtr, ScopArray->getElementType()->getPointerTo()); + HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset); + } + + HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); + + if (Offset) { + Size = Builder.CreateSub( + Size, Builder.CreateMul( + Offset, Builder.getInt64(ScopArray->getElemSizeInBytes()))); + } + + if (Direction == HOST_TO_DEVICE) + createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); + else + createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); + + isl_id_free(Id); + isl_ast_expr_free(Arg); + isl_ast_expr_free(Expr); + isl_ast_node_free(TransferStmt); +} + +void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { + isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); + isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); + isl_id *Id = isl_ast_expr_get_id(StmtExpr); + isl_id_free(Id); + isl_ast_expr_free(StmtExpr); + + const char *Str = isl_id_get_name(Id); + if (!strcmp(Str, "kernel")) { + createKernel(UserStmt); + if (PollyManagedMemory) + createCallSynchronizeDevice(); + isl_ast_expr_free(Expr); + return; + } + if (!strcmp(Str, "init_device")) { + initializeAfterRTH(); + isl_ast_node_free(UserStmt); + isl_ast_expr_free(Expr); + return; + } + if (!strcmp(Str, "clear_device")) { + finalize(); + isl_ast_node_free(UserStmt); + isl_ast_expr_free(Expr); + return; + } + if (isPrefix(Str, "to_device")) { + if (!PollyManagedMemory) + createDataTransfer(UserStmt, HOST_TO_DEVICE); + else + isl_ast_node_free(UserStmt); + + isl_ast_expr_free(Expr); + return; + } + + if (isPrefix(Str, "from_device")) { + if (!PollyManagedMemory) { + createDataTransfer(UserStmt, DEVICE_TO_HOST); + } else { + isl_ast_node_free(UserStmt); + } + isl_ast_expr_free(Expr); + return; + } + + isl_id *Anno = isl_ast_node_get_annotation(UserStmt); + struct ppcg_kernel_stmt *KernelStmt = + (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); + isl_id_free(Anno); + + switch (KernelStmt->type) { + case ppcg_kernel_domain: + createScopStmt(Expr, KernelStmt); + isl_ast_node_free(UserStmt); + return; + case ppcg_kernel_copy: + createKernelCopy(KernelStmt); + isl_ast_expr_free(Expr); + isl_ast_node_free(UserStmt); + return; + case ppcg_kernel_sync: + createKernelSync(); + isl_ast_expr_free(Expr); + isl_ast_node_free(UserStmt); + return; + } + + isl_ast_expr_free(Expr); + isl_ast_node_free(UserStmt); +} + +void GPUNodeBuilder::createFor(__isl_take isl_ast_node *Node) { + createForSequential(isl::manage(Node).as(), false); +} + +void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) { + isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index); + auto LocalAddr = ExprBuilder.createAccessAddress(LocalIndex); + isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index); + auto GlobalAddr = ExprBuilder.createAccessAddress(Index); + + if (KernelStmt->u.c.read) { + LoadInst *Load = + Builder.CreateLoad(GlobalAddr.second, GlobalAddr.first, "shared.read"); + Builder.CreateStore(Load, LocalAddr.first); + } else { + LoadInst *Load = + Builder.CreateLoad(LocalAddr.second, LocalAddr.first, "shared.write"); + Builder.CreateStore(Load, GlobalAddr.first); + } +} + +void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, + ppcg_kernel_stmt *KernelStmt) { + auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; + isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; + + LoopToScevMapT LTS; + LTS.insert(OutsideLoopIterations.begin(), 
OutsideLoopIterations.end()); + + createSubstitutions(Expr, Stmt, LTS); + + if (Stmt->isBlockStmt()) + BlockGen.copyStmt(*Stmt, LTS, Indexes); + else + RegionGen.copyStmt(*Stmt, LTS, Indexes); +} + +void GPUNodeBuilder::createKernelSync() { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + const char *SpirName = "__gen_ocl_barrier_global"; + + Function *Sync; + + switch (Arch) { + case GPUArch::SPIR64: + case GPUArch::SPIR32: + Sync = M->getFunction(SpirName); + + // If Sync is not available, declare it. + if (!Sync) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + Sync = Function::Create(Ty, Linkage, SpirName, M); + Sync->setCallingConv(CallingConv::SPIR_FUNC); + } + break; + case GPUArch::NVPTX64: + Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); + break; + } + + Builder.CreateCall(Sync, {}); +} + +/// Collect llvm::Values referenced from @p Node +/// +/// This function only applies to isl_ast_nodes that are user_nodes referring +/// to a ScopStmt. All other node types are ignore. +/// +/// @param Node The node to collect references for. +/// @param User A user pointer used as storage for the data that is collected. +/// +/// @returns isl_bool_true if data could be collected successfully. +isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { + if (isl_ast_node_get_type(Node) != isl_ast_node_user) + return isl_bool_true; + + isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); + isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); + isl_id *Id = isl_ast_expr_get_id(StmtExpr); + const char *Str = isl_id_get_name(Id); + isl_id_free(Id); + isl_ast_expr_free(StmtExpr); + isl_ast_expr_free(Expr); + + if (!isPrefix(Str, "Stmt")) + return isl_bool_true; + + Id = isl_ast_node_get_annotation(Node); + auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); + auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; + isl_id_free(Id); + + addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */); + + return isl_bool_true; +} + +/// A list of functions that are available in NVIDIA's libdevice. +const std::set CUDALibDeviceFunctions = { + "exp", "expf", "expl", "cos", "cosf", "sqrt", "sqrtf", + "copysign", "copysignf", "copysignl", "log", "logf", "powi", "powif"}; + +// A map from intrinsics to their corresponding libdevice functions. +const std::map IntrinsicToLibdeviceFunc = { + {"llvm.exp.f64", "exp"}, + {"llvm.exp.f32", "expf"}, + {"llvm.powi.f64.i32", "powi"}, + {"llvm.powi.f32.i32", "powif"}}; + +/// Return the corresponding CUDA libdevice function name @p Name. +/// Note that this function will try to convert instrinsics in the list +/// IntrinsicToLibdeviceFunc into libdevice functions. +/// This is because some intrinsics such as `exp` +/// are not supported by the NVPTX backend. +/// If this restriction of the backend is lifted, we should refactor our code +/// so that we use intrinsics whenever possible. +/// +/// Return "" if we are not compiling for CUDA. +std::string getCUDALibDeviceFuntion(StringRef NameRef) { + std::string Name = NameRef.str(); + auto It = IntrinsicToLibdeviceFunc.find(Name); + if (It != IntrinsicToLibdeviceFunc.end()) + return getCUDALibDeviceFuntion(It->second); + + if (CUDALibDeviceFunctions.count(Name)) + return ("__nv_" + Name); + + return ""; +} + +/// Check if F is a function that we can code-generate in a GPU kernel. 
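+///
+/// For example (sketch): intrinsics such as llvm.sqrt.f64, llvm.fabs.f32 or
+/// llvm.copysign.f64 are always accepted, whereas calls like expf or cosf are
+/// only accepted if @p AllowLibDevice is set, in which case they are later
+/// redirected to their __nv_* libdevice counterparts. Any other call makes
+/// the kernel invalid for GPU code generation.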
+static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) { + assert(F && "F is an invalid pointer"); + // We string compare against the name of the function to allow + // all variants of the intrinsic "llvm.sqrt.*", "llvm.fabs", and + // "llvm.copysign". + const StringRef Name = F->getName(); + + if (AllowLibDevice && getCUDALibDeviceFuntion(Name).length() > 0) + return true; + + return F->isIntrinsic() && + (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") || + Name.startswith("llvm.copysign")); +} + +/// Do not take `Function` as a subtree value. +/// +/// We try to take the reference of all subtree values and pass them along +/// to the kernel from the host. Taking an address of any function and +/// trying to pass along is nonsensical. Only allow `Value`s that are not +/// `Function`s. +static bool isValidSubtreeValue(llvm::Value *V) { return !isa(V); } + +/// Return `Function`s from `RawSubtreeValues`. +static SetVector +getFunctionsFromRawSubtreeValues(SetVector RawSubtreeValues, + bool AllowCUDALibDevice) { + SetVector SubtreeFunctions; + for (Value *It : RawSubtreeValues) { + Function *F = dyn_cast(It); + if (F) { + assert(isValidFunctionInKernel(F, AllowCUDALibDevice) && + "Code should have bailed out by " + "this point if an invalid function " + "were present in a kernel."); + SubtreeFunctions.insert(F); + } + } + return SubtreeFunctions; +} + +std::tuple, SetVector, SetVector, + isl::space> +GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { + SetVector SubtreeValues; + SetVector SCEVs; + SetVector Loops; + isl::space ParamSpace = isl::space(S.getIslCtx(), 0, 0).params(); + SubtreeReferences References = { + LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator(), + &ParamSpace}; + + for (const auto &I : IDToValue) + SubtreeValues.insert(I.second); + + // NOTE: this is populated in IslNodeBuilder::addParameters + // See [Code generation of induction variables of loops outside Scops]. + for (const auto &I : OutsideLoopIterations) + SubtreeValues.insert(cast(I.second)->getValue()); + + isl_ast_node_foreach_descendant_top_down( + Kernel->tree, collectReferencesInGPUStmt, &References); + + for (const SCEV *Expr : SCEVs) { + findValues(Expr, SE, SubtreeValues); + findLoops(Expr, Loops); + } + + Loops.remove_if([this](const Loop *L) { + return S.contains(L) || L->contains(S.getEntry()); + }); + + for (auto &SAI : S.arrays()) + SubtreeValues.remove(SAI->getBasePtr()); + + isl_space *Space = S.getParamSpace().release(); + for (long i = 0, n = isl_space_dim(Space, isl_dim_param); i < n; i++) { + isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); + assert(IDToValue.count(Id)); + Value *Val = IDToValue[Id]; + SubtreeValues.remove(Val); + isl_id_free(Id); + } + isl_space_free(Space); + + for (long i = 0, n = isl_space_dim(Kernel->space, isl_dim_set); i < n; i++) { + isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); + assert(IDToValue.count(Id)); + Value *Val = IDToValue[Id]; + SubtreeValues.remove(Val); + isl_id_free(Id); + } + + // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions + // SubtreeValues. This is important, because we should not lose any + // SubtreeValues in the process of constructing the + // "ValidSubtree{Values, Functions} sets. Nor should the set + // ValidSubtree{Values, Functions} have any common element. 
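+  //
+  // Illustrative (hypothetical) example: if SubtreeValues contains
+  // { %n, %tmp, @expf }, then ValidSubtreeValues becomes { %n, %tmp } and
+  // ValidSubtreeFunctions becomes { @expf }; every element ends up in exactly
+  // one of the two sets.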
+ auto ValidSubtreeValuesIt = + make_filter_range(SubtreeValues, isValidSubtreeValue); + SetVector ValidSubtreeValues(ValidSubtreeValuesIt.begin(), + ValidSubtreeValuesIt.end()); + + bool AllowCUDALibDevice = Arch == GPUArch::NVPTX64; + + SetVector ValidSubtreeFunctions( + getFunctionsFromRawSubtreeValues(SubtreeValues, AllowCUDALibDevice)); + + // @see IslNodeBuilder::getReferencesInSubtree + SetVector ReplacedValues; + for (Value *V : ValidSubtreeValues) { + auto It = ValueMap.find(V); + if (It == ValueMap.end()) + ReplacedValues.insert(V); + else + ReplacedValues.insert(It->second); + } + return std::make_tuple(ReplacedValues, ValidSubtreeFunctions, Loops, + ParamSpace); +} + +void GPUNodeBuilder::clearDominators(Function *F) { + DomTreeNode *N = DT.getNode(&F->getEntryBlock()); + std::vector Nodes; + for (po_iterator I = po_begin(N), E = po_end(N); I != E; ++I) + Nodes.push_back(I->getBlock()); + + for (BasicBlock *BB : Nodes) + DT.eraseNode(BB); +} + +void GPUNodeBuilder::clearScalarEvolution(Function *F) { + for (BasicBlock &BB : *F) { + Loop *L = LI.getLoopFor(&BB); + if (L) + SE.forgetLoop(L); + } +} + +void GPUNodeBuilder::clearLoops(Function *F) { + SmallSet WorkList; + for (BasicBlock &BB : *F) { + Loop *L = LI.getLoopFor(&BB); + if (L) + WorkList.insert(L); + } + for (auto *L : WorkList) + LI.erase(L); +} + +std::tuple GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { + std::vector Sizes; + isl::ast_build Context = isl::ast_build::from_context(S.getContext()); + + isl::multi_pw_aff GridSizePwAffs = isl::manage_copy(Kernel->grid_size); + for (long i = 0; i < Kernel->n_grid; i++) { + isl::pw_aff Size = GridSizePwAffs.at(i); + isl::ast_expr GridSize = Context.expr_from(Size); + Value *Res = ExprBuilder.create(GridSize.release()); + Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); + Sizes.push_back(Res); + } + + for (long i = Kernel->n_grid; i < 3; i++) + Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); + + return std::make_tuple(Sizes[0], Sizes[1]); +} + +std::tuple +GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { + std::vector Sizes; + + for (long i = 0; i < Kernel->n_block; i++) { + Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); + Sizes.push_back(Res); + } + + for (long i = Kernel->n_block; i < 3; i++) + Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); + + return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); +} + +void GPUNodeBuilder::insertStoreParameter(Type *ArrayTy, + Instruction *Parameters, + Instruction *Param, int Index) { + Value *Slot = Builder.CreateGEP( + ArrayTy, Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); + Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); + Builder.CreateStore(ParamTyped, Slot); +} + +Value * +GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, + SetVector SubtreeValues) { + const int NumArgs = F->arg_size(); + std::vector ArgSizes(NumArgs); + + // If we are using the OpenCL Runtime, we need to add the kernel argument + // sizes to the end of the launch-parameter list, so OpenCL can determine + // how big the respective kernel arguments are. + // Here we need to reserve adequate space for that. 
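+  //
+  // Assumed layout of the launch-parameter array built below: slots
+  // [0, NumArgs) hold i8* pointers to the individual argument values; for
+  // OpenCL, slots [NumArgs, 2 * NumArgs) additionally hold i8* pointers to
+  // i32 values carrying the size in bytes of each argument.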
+ Type *ArrayTy; + if (Runtime == GPURuntime::OpenCL) + ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); + else + ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumArgs); + + BasicBlock *EntryBlock = + &Builder.GetInsertBlock()->getParent()->getEntryBlock(); + auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace(); + std::string Launch = "polly_launch_" + std::to_string(Kernel->id); + Instruction *Parameters = new AllocaInst( + ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator()); + + int Index = 0; + for (long i = 0; i < Prog->n_array; i++) { + if (!ppcg_kernel_requires_array_argument(Kernel, i)) + continue; + + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); + + if (Runtime == GPURuntime::OpenCL) + ArgSizes[Index] = SAI->getElemSizeInBytes(); + + Value *DevArray = nullptr; + if (PollyManagedMemory) { + DevArray = getManagedDeviceArray(&Prog->array[i], + const_cast(SAI)); + } else { + DevArray = DeviceAllocations[const_cast(SAI)]; + DevArray = createCallGetDevicePtr(DevArray); + } + assert(DevArray != nullptr && "Array to be offloaded to device not " + "initialized"); + Value *Offset = getArrayOffset(&Prog->array[i]); + + if (Offset) { + DevArray = Builder.CreatePointerCast( + DevArray, SAI->getElementType()->getPointerTo()); + DevArray = Builder.CreateGEP(SAI->getElementType(), DevArray, + Builder.CreateNeg(Offset)); + DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy()); + } + Value *Slot = Builder.CreateGEP( + ArrayTy, Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); + + if (gpu_array_is_read_only_scalar(&Prog->array[i])) { + Value *ValPtr = nullptr; + if (PollyManagedMemory) + ValPtr = DevArray; + else + ValPtr = BlockGen.getOrCreateAlloca(SAI); + + assert(ValPtr != nullptr && "ValPtr that should point to a valid object" + " to be stored into Parameters"); + Value *ValPtrCast = + Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy()); + Builder.CreateStore(ValPtrCast, Slot); + } else { + Instruction *Param = + new AllocaInst(Builder.getInt8PtrTy(), AddressSpace, + Launch + "_param_" + std::to_string(Index), + EntryBlock->getTerminator()); + Builder.CreateStore(DevArray, Param); + Value *ParamTyped = + Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); + Builder.CreateStore(ParamTyped, Slot); + } + Index++; + } + + int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); + + for (long i = 0; i < NumHostIters; i++) { + isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); + Value *Val = IDToValue[Id]; + isl_id_free(Id); + + if (Runtime == GPURuntime::OpenCL) + ArgSizes[Index] = computeSizeInBytes(Val->getType()); + + Instruction *Param = + new AllocaInst(Val->getType(), AddressSpace, + Launch + "_param_" + std::to_string(Index), + EntryBlock->getTerminator()); + Builder.CreateStore(Val, Param); + insertStoreParameter(ArrayTy, Parameters, Param, Index); + Index++; + } + + int NumVars = isl_space_dim(Kernel->space, isl_dim_param); + + for (long i = 0; i < NumVars; i++) { + isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); + Value *Val = IDToValue[Id]; + if (ValueMap.count(Val)) + Val = ValueMap[Val]; + isl_id_free(Id); + + if (Runtime == GPURuntime::OpenCL) + ArgSizes[Index] = computeSizeInBytes(Val->getType()); + + Instruction *Param = + new AllocaInst(Val->getType(), AddressSpace, + Launch + "_param_" + std::to_string(Index), + EntryBlock->getTerminator()); + 
Builder.CreateStore(Val, Param); + insertStoreParameter(ArrayTy, Parameters, Param, Index); + Index++; + } + + for (auto Val : SubtreeValues) { + if (Runtime == GPURuntime::OpenCL) + ArgSizes[Index] = computeSizeInBytes(Val->getType()); + + Instruction *Param = + new AllocaInst(Val->getType(), AddressSpace, + Launch + "_param_" + std::to_string(Index), + EntryBlock->getTerminator()); + Builder.CreateStore(Val, Param); + insertStoreParameter(ArrayTy, Parameters, Param, Index); + Index++; + } + + if (Runtime == GPURuntime::OpenCL) { + for (int i = 0; i < NumArgs; i++) { + Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); + Instruction *Param = + new AllocaInst(Builder.getInt32Ty(), AddressSpace, + Launch + "_param_size_" + std::to_string(i), + EntryBlock->getTerminator()); + Builder.CreateStore(Val, Param); + insertStoreParameter(ArrayTy, Parameters, Param, Index); + Index++; + } + } + + auto Location = EntryBlock->getTerminator(); + return new BitCastInst(Parameters, Builder.getInt8PtrTy(), + Launch + "_params_i8ptr", Location); +} + +void GPUNodeBuilder::setupKernelSubtreeFunctions( + SetVector SubtreeFunctions) { + for (auto Fn : SubtreeFunctions) { + const std::string ClonedFnName = Fn->getName().str(); + Function *Clone = GPUModule->getFunction(ClonedFnName); + if (!Clone) + Clone = + Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage, + ClonedFnName, GPUModule.get()); + assert(Clone && "Expected cloned function to be initialized."); + assert(ValueMap.find(Fn) == ValueMap.end() && + "Fn already present in ValueMap"); + ValueMap[Fn] = Clone; + } +} +void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { + isl_id *Id = isl_ast_node_get_annotation(KernelStmt); + ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); + isl_id_free(Id); + isl_ast_node_free(KernelStmt); + + if (Kernel->n_grid > 1) + DeepestParallel = std::max( + DeepestParallel, (unsigned)isl_space_dim(Kernel->space, isl_dim_set)); + else + DeepestSequential = std::max( + DeepestSequential, (unsigned)isl_space_dim(Kernel->space, isl_dim_set)); + + Value *BlockDimX, *BlockDimY, *BlockDimZ; + std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); + + SetVector SubtreeValues; + SetVector SubtreeFunctions; + SetVector Loops; + isl::space ParamSpace; + std::tie(SubtreeValues, SubtreeFunctions, Loops, ParamSpace) = + getReferencesInKernel(Kernel); + + // Add parameters that appear only in the access function to the kernel + // space. This is important to make sure that all isl_ids are passed as + // parameters to the kernel, even though we may not have all parameters + // in the context to improve compile time. + Kernel->space = isl_space_align_params(Kernel->space, ParamSpace.release()); + + assert(Kernel->tree && "Device AST of kernel node is empty"); + + Instruction &HostInsertPoint = *Builder.GetInsertPoint(); + IslExprBuilder::IDToValueTy HostIDs = IDToValue; + ValueMapT HostValueMap = ValueMap; + BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; + ScalarMap.clear(); + BlockGenerator::EscapeUsersAllocaMapTy HostEscapeMap = EscapeMap; + EscapeMap.clear(); + + // Create for all loops we depend on values that contain the current loop + // iteration. These values are necessary to generate code for SCEVs that + // depend on such loops. As a result we need to pass them to the subfunction. 
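+  //
+  // Concretely, the loop below materializes for each such loop L the add
+  // recurrence {0,+,1}<L> on the host, expands it to a value V, records V in
+  // OutsideLoopIterations and adds it to the SubtreeValues passed to the
+  // kernel.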
+ for (const Loop *L : Loops) { + const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), + SE.getUnknown(Builder.getInt64(1)), + L, SCEV::FlagAnyWrap); + Value *V = generateSCEV(OuterLIV); + OutsideLoopIterations[L] = SE.getUnknown(V); + SubtreeValues.insert(V); + } + + createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions); + setupKernelSubtreeFunctions(SubtreeFunctions); + + create(isl_ast_node_copy(Kernel->tree)); + + finalizeKernelArguments(Kernel); + Function *F = Builder.GetInsertBlock()->getParent(); + if (Arch == GPUArch::NVPTX64) + addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); + clearDominators(F); + clearScalarEvolution(F); + clearLoops(F); + + IDToValue = HostIDs; + + ValueMap = std::move(HostValueMap); + ScalarMap = std::move(HostScalarMap); + EscapeMap = std::move(HostEscapeMap); + IDToSAI.clear(); + Annotator.resetAlternativeAliasBases(); + for (auto &BasePtr : LocalArrays) + S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array); + LocalArrays.clear(); + + std::string ASMString = finalizeKernelFunction(); + Builder.SetInsertPoint(&HostInsertPoint); + Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); + + std::string Name = getKernelFuncName(Kernel->id); + Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); + Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); + Value *GPUKernel = createCallGetKernel(KernelString, NameString); + + Value *GridDimX, *GridDimY; + std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); + + createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, + BlockDimZ, Parameters); + createCallFreeKernel(GPUKernel); + + for (auto Id : KernelIds) + isl_id_free(Id); + + KernelIds.clear(); +} + +/// Compute the DataLayout string for the NVPTX backend. +/// +/// @param is64Bit Are we looking for a 64 bit architecture? +static std::string computeNVPTXDataLayout(bool is64Bit) { + std::string Ret = ""; + + if (!is64Bit) { + Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" + "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" + "64-v128:128:128-n16:32:64"; + } else { + Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" + "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" + "64-v128:128:128-n16:32:64"; + } + + return Ret; +} + +/// Compute the DataLayout string for a SPIR kernel. +/// +/// @param is64Bit Are we looking for a 64 bit architecture? 
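+///
+/// The returned string uses LLVM's target data-layout syntax; for instance,
+/// in the 64-bit variant below, "e" requests little-endian layout and
+/// "p:64:64:64" declares 64-bit pointers with 64-bit ABI and preferred
+/// alignment. This is a convenience summary, not a full description of the
+/// format.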
+static std::string computeSPIRDataLayout(bool is64Bit) { + std::string Ret = ""; + + if (!is64Bit) { + Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" + "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" + "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" + "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; + } else { + Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" + "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" + "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" + "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; + } + + return Ret; +} + +Function * +GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, + SetVector &SubtreeValues) { + std::vector Args; + std::string Identifier = getKernelFuncName(Kernel->id); + + std::vector MemoryType; + + for (long i = 0; i < Prog->n_array; i++) { + if (!ppcg_kernel_requires_array_argument(Kernel, i)) + continue; + + if (gpu_array_is_read_only_scalar(&Prog->array[i])) { + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); + Args.push_back(SAI->getElementType()); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); + } else { + static const int UseGlobalMemory = 1; + Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory)); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1))); + } + } + + int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); + + for (long i = 0; i < NumHostIters; i++) { + Args.push_back(Builder.getInt64Ty()); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); + } + + int NumVars = isl_space_dim(Kernel->space, isl_dim_param); + + for (long i = 0; i < NumVars; i++) { + isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); + Value *Val = IDToValue[Id]; + isl_id_free(Id); + Args.push_back(Val->getType()); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); + } + + for (auto *V : SubtreeValues) { + Args.push_back(V->getType()); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); + } + + auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); + auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, + GPUModule.get()); + + std::vector EmptyStrings; + + for (unsigned int i = 0; i < MemoryType.size(); i++) { + EmptyStrings.push_back(MDString::get(FN->getContext(), "")); + } + + if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) { + FN->setMetadata("kernel_arg_addr_space", + MDNode::get(FN->getContext(), MemoryType)); + FN->setMetadata("kernel_arg_name", + MDNode::get(FN->getContext(), EmptyStrings)); + FN->setMetadata("kernel_arg_access_qual", + MDNode::get(FN->getContext(), EmptyStrings)); + FN->setMetadata("kernel_arg_type", + MDNode::get(FN->getContext(), EmptyStrings)); + FN->setMetadata("kernel_arg_type_qual", + MDNode::get(FN->getContext(), EmptyStrings)); + FN->setMetadata("kernel_arg_base_type", + MDNode::get(FN->getContext(), EmptyStrings)); + } + + switch (Arch) { + case GPUArch::NVPTX64: + FN->setCallingConv(CallingConv::PTX_Kernel); + break; + case GPUArch::SPIR32: + case GPUArch::SPIR64: + FN->setCallingConv(CallingConv::SPIR_KERNEL); + break; + } + + auto Arg = FN->arg_begin(); + for (long i = 0; i < Kernel->n_array; i++) { + if (!ppcg_kernel_requires_array_argument(Kernel, i)) + continue; + 
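+    // For every array argument, rebuild a ScopArrayInfo describing the
+    // kernel-side view of the array: the incoming argument becomes the base
+    // pointer and the outer dimension sizes are recomputed from ppcg's bound
+    // expressions, so code generated inside the kernel can index the argument
+    // directly.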
+ Arg->setName(Kernel->array[i].array->name); + + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id)); + Type *EleTy = SAI->getElementType(); + Value *Val = &*Arg; + SmallVector Sizes; + isl_ast_build *Build = + isl_ast_build_from_context(isl_set_copy(Prog->context)); + Sizes.push_back(nullptr); + for (long j = 1, n = Kernel->array[i].array->n_index; j < n; j++) { + isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( + Build, isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j)); + auto V = ExprBuilder.create(DimSize); + Sizes.push_back(SE.getSCEV(V)); + } + const ScopArrayInfo *SAIRep = + S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array); + LocalArrays.push_back(Val); + + isl_ast_build_free(Build); + KernelIds.push_back(Id); + IDToSAI[Id] = SAIRep; + Arg++; + } + + for (long i = 0; i < NumHostIters; i++) { + isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); + Arg->setName(isl_id_get_name(Id)); + IDToValue[Id] = &*Arg; + KernelIDs.insert(std::unique_ptr(Id)); + Arg++; + } + + for (long i = 0; i < NumVars; i++) { + isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); + Arg->setName(isl_id_get_name(Id)); + Value *Val = IDToValue[Id]; + ValueMap[Val] = &*Arg; + IDToValue[Id] = &*Arg; + KernelIDs.insert(std::unique_ptr(Id)); + Arg++; + } + + for (auto *V : SubtreeValues) { + Arg->setName(V->getName()); + ValueMap[V] = &*Arg; + Arg++; + } + + return FN; +} + +void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { + Intrinsic::ID IntrinsicsBID[2]; + Intrinsic::ID IntrinsicsTID[3]; + + switch (Arch) { + case GPUArch::SPIR64: + case GPUArch::SPIR32: + llvm_unreachable("Cannot generate NVVM intrinsics for SPIR"); + case GPUArch::NVPTX64: + IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; + IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; + + IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; + IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; + IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; + break; + } + + auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { + std::string Name = isl_id_get_name(Id); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); + Value *Val = Builder.CreateCall(IntrinsicFn, {}); + Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); + IDToValue[Id] = Val; + KernelIDs.insert(std::unique_ptr(Id)); + }; + + for (int i = 0; i < Kernel->n_grid; ++i) { + isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); + addId(Id, IntrinsicsBID[i]); + } + + for (int i = 0; i < Kernel->n_block; ++i) { + isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); + addId(Id, IntrinsicsTID[i]); + } +} + +void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel, + bool SizeTypeIs64bit) { + const char *GroupName[3] = {"__gen_ocl_get_group_id0", + "__gen_ocl_get_group_id1", + "__gen_ocl_get_group_id2"}; + + const char *LocalName[3] = {"__gen_ocl_get_local_id0", + "__gen_ocl_get_local_id1", + "__gen_ocl_get_local_id2"}; + IntegerType *SizeT = + SizeTypeIs64bit ? Builder.getInt64Ty() : Builder.getInt32Ty(); + + auto createFunc = [this](const char *Name, __isl_take isl_id *Id, + IntegerType *SizeT) mutable { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *FN = M->getFunction(Name); + + // If FN is not available, declare it. 
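+    // Sketch: the builtin is declared as a zero-argument SPIR function
+    // returning SizeT (i32 or i64, depending on the device's size_t), roughly
+    // `size_t __gen_ocl_get_group_id0(void)`. If SizeT is i32, the result is
+    // widened to i64 before use.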
+ if (!FN) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + FunctionType *Ty = FunctionType::get(SizeT, Args, false); + FN = Function::Create(Ty, Linkage, Name, M); + FN->setCallingConv(CallingConv::SPIR_FUNC); + } + + Value *Val = Builder.CreateCall(FN, {}); + if (SizeT == Builder.getInt32Ty()) + Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); + IDToValue[Id] = Val; + KernelIDs.insert(std::unique_ptr(Id)); + }; + + for (int i = 0; i < Kernel->n_grid; ++i) + createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i), SizeT); + + for (int i = 0; i < Kernel->n_block; ++i) + createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i), SizeT); +} + +void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { + auto Arg = FN->arg_begin(); + for (long i = 0; i < Kernel->n_array; i++) { + if (!ppcg_kernel_requires_array_argument(Kernel, i)) + continue; + + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id)); + isl_id_free(Id); + + if (SAI->getNumberOfDimensions() > 0) { + Arg++; + continue; + } + + Value *Val = &*Arg; + + if (!gpu_array_is_read_only_scalar(&Prog->array[i])) { + Type *TypePtr = SAI->getElementType()->getPointerTo(); + Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr); + Val = Builder.CreateLoad(SAI->getElementType(), TypedArgPtr); + } + + Value *Alloca = BlockGen.getOrCreateAlloca(SAI); + Builder.CreateStore(Val, Alloca); + + Arg++; + } +} + +void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) { + auto *FN = Builder.GetInsertBlock()->getParent(); + auto Arg = FN->arg_begin(); + + bool StoredScalar = false; + for (long i = 0; i < Kernel->n_array; i++) { + if (!ppcg_kernel_requires_array_argument(Kernel, i)) + continue; + + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id)); + isl_id_free(Id); + + if (SAI->getNumberOfDimensions() > 0) { + Arg++; + continue; + } + + if (gpu_array_is_read_only_scalar(&Prog->array[i])) { + Arg++; + continue; + } + + Value *Alloca = BlockGen.getOrCreateAlloca(SAI); + Value *ArgPtr = &*Arg; + Type *TypePtr = SAI->getElementType()->getPointerTo(); + Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); + Value *Val = Builder.CreateLoad(SAI->getElementType(), Alloca); + Builder.CreateStore(Val, TypedArgPtr); + StoredScalar = true; + + Arg++; + } + + if (StoredScalar) { + /// In case more than one thread contains scalar stores, the generated + /// code might be incorrect, if we only store at the end of the kernel. + /// To support this case we need to store these scalars back at each + /// memory store or at least before each kernel barrier. + if (Kernel->n_block != 0 || Kernel->n_grid != 0) { + BuildSuccessful = 0; + LLVM_DEBUG( + dbgs() << getUniqueScopName(&S) + << " has a store to a scalar value that" + " would be undefined to run in parallel. 
Bailing out.\n";); + } + } +} + +void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + + for (int i = 0; i < Kernel->n_var; ++i) { + struct ppcg_kernel_var &Var = Kernel->var[i]; + isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set); + Type *EleTy = ScopArrayInfo::getFromId(isl::manage(Id))->getElementType(); + + Type *ArrayTy = EleTy; + SmallVector Sizes; + + Sizes.push_back(nullptr); + for (unsigned int j = 1; j < Var.array->n_index; ++j) { + isl_val *Val = isl_vec_get_element_val(Var.size, j); + long Bound = isl_val_get_num_si(Val); + isl_val_free(Val); + Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound)); + } + + for (int j = Var.array->n_index - 1; j >= 0; --j) { + isl_val *Val = isl_vec_get_element_val(Var.size, j); + long Bound = isl_val_get_num_si(Val); + isl_val_free(Val); + ArrayTy = ArrayType::get(ArrayTy, Bound); + } + + const ScopArrayInfo *SAI; + Value *Allocation; + if (Var.type == ppcg_access_shared) { + auto GlobalVar = new GlobalVariable( + *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name, + nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3); + GlobalVar->setAlignment(llvm::Align(EleTy->getPrimitiveSizeInBits() / 8)); + GlobalVar->setInitializer(Constant::getNullValue(ArrayTy)); + + Allocation = GlobalVar; + } else if (Var.type == ppcg_access_private) { + Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array"); + } else { + llvm_unreachable("unknown variable type"); + } + SAI = + S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array); + Id = isl_id_alloc(S.getIslCtx().get(), Var.name, nullptr); + IDToValue[Id] = Allocation; + LocalArrays.push_back(Allocation); + KernelIds.push_back(Id); + IDToSAI[Id] = SAI; + } +} + +void GPUNodeBuilder::createKernelFunction( + ppcg_kernel *Kernel, SetVector &SubtreeValues, + SetVector &SubtreeFunctions) { + std::string Identifier = getKernelFuncName(Kernel->id); + GPUModule.reset(new Module(Identifier, Builder.getContext())); + + switch (Arch) { + case GPUArch::NVPTX64: + if (Runtime == GPURuntime::CUDA) + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); + else if (Runtime == GPURuntime::OpenCL) + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); + GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); + break; + case GPUArch::SPIR32: + GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown")); + GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */)); + break; + case GPUArch::SPIR64: + GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown")); + GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */)); + break; + } + + Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); + + BasicBlock *PrevBlock = Builder.GetInsertBlock(); + auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); + + DT.addNewBlock(EntryBlock, PrevBlock); + + Builder.SetInsertPoint(EntryBlock); + Builder.CreateRetVoid(); + Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); + + ScopDetection::markFunctionAsInvalid(FN); + + prepareKernelArguments(Kernel, FN); + createKernelVariables(Kernel, FN); + + switch (Arch) { + case GPUArch::NVPTX64: + insertKernelIntrinsics(Kernel); + break; + case GPUArch::SPIR32: + insertKernelCallsSPIR(Kernel, false); + break; + case GPUArch::SPIR64: + insertKernelCallsSPIR(Kernel, true); + break; + } +} + +std::string 
GPUNodeBuilder::createKernelASM() { + llvm::Triple GPUTriple; + + switch (Arch) { + case GPUArch::NVPTX64: + switch (Runtime) { + case GPURuntime::CUDA: + GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda")); + break; + case GPURuntime::OpenCL: + GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl")); + break; + } + break; + case GPUArch::SPIR64: + case GPUArch::SPIR32: + std::string SPIRAssembly; + raw_string_ostream IROstream(SPIRAssembly); + IROstream << *GPUModule; + IROstream.flush(); + return SPIRAssembly; + } + + std::string ErrMsg; + auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); + + if (!GPUTarget) { + errs() << ErrMsg << "\n"; + return ""; + } + + TargetOptions Options; + Options.UnsafeFPMath = FastMath; + + std::string subtarget; + + switch (Arch) { + case GPUArch::NVPTX64: + subtarget = CudaVersion; + break; + case GPUArch::SPIR32: + case GPUArch::SPIR64: + llvm_unreachable("No subtarget for SPIR architecture"); + } + + std::unique_ptr TargetM(GPUTarget->createTargetMachine( + GPUTriple.getTriple(), subtarget, "", Options, std::nullopt)); + + SmallString<0> ASMString; + raw_svector_ostream ASMStream(ASMString); + llvm::legacy::PassManager PM; + + PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); + + if (TargetM->addPassesToEmitFile(PM, ASMStream, nullptr, CGFT_AssemblyFile, + true /* verify */)) { + errs() << "The target does not support generation of this file type!\n"; + return ""; + } + + PM.run(*GPUModule); + + return ASMStream.str().str(); +} + +bool GPUNodeBuilder::requiresCUDALibDevice() { + bool RequiresLibDevice = false; + for (Function &F : GPUModule->functions()) { + if (!F.isDeclaration()) + continue; + + const std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(F.getName()); + if (CUDALibDeviceFunc.length() != 0) { + // We need to handle the case where a module looks like this: + // @expf(..) + // @llvm.exp.f64(..) + // Both of these functions would be renamed to `__nv_expf`. + // + // So, we must first check for the existence of the libdevice function. + // If this exists, we replace our current function with it. + // + // If it does not exist, we rename the current function to the + // libdevice functiono name. + if (Function *Replacement = F.getParent()->getFunction(CUDALibDeviceFunc)) + F.replaceAllUsesWith(Replacement); + else + F.setName(CUDALibDeviceFunc); + RequiresLibDevice = true; + } + } + + return RequiresLibDevice; +} + +void GPUNodeBuilder::addCUDALibDevice() { + if (Arch != GPUArch::NVPTX64) + return; + + if (requiresCUDALibDevice()) { + SMDiagnostic Error; + + errs() << CUDALibDevice << "\n"; + auto LibDeviceModule = + parseIRFile(CUDALibDevice, Error, GPUModule->getContext()); + + if (!LibDeviceModule) { + BuildSuccessful = false; + report_fatal_error("Could not find or load libdevice. Skipping GPU " + "kernel generation. Please set -polly-acc-libdevice " + "accordingly.\n"); + return; + } + + Linker L(*GPUModule); + + // Set an nvptx64 target triple to avoid linker warnings. The original + // triple of the libdevice files are nvptx-unknown-unknown. 
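+    //
+    // Because the module is linked with Linker::LinkOnlyNeeded, only the
+    // __nv_* definitions that are actually referenced from GPUModule are
+    // pulled in; unused libdevice functions do not end up in the kernel
+    // module.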
+ LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); + L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded); + } +} + +std::string GPUNodeBuilder::finalizeKernelFunction() { + + if (verifyModule(*GPUModule)) { + LLVM_DEBUG(dbgs() << "verifyModule failed on module:\n"; + GPUModule->print(dbgs(), nullptr); dbgs() << "\n";); + LLVM_DEBUG(dbgs() << "verifyModule Error:\n"; + verifyModule(*GPUModule, &dbgs());); + + if (FailOnVerifyModuleFailure) + llvm_unreachable("VerifyModule failed."); + + BuildSuccessful = false; + return ""; + } + + addCUDALibDevice(); + + if (DumpKernelIR) + outs() << *GPUModule << "\n"; + + if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) { + // Optimize module. + llvm::legacy::PassManager OptPasses; + PassManagerBuilder PassBuilder; + PassBuilder.OptLevel = 3; + PassBuilder.SizeLevel = 0; + PassBuilder.populateModulePassManager(OptPasses); + OptPasses.run(*GPUModule); + } + + std::string Assembly = createKernelASM(); + + if (DumpKernelASM) + outs() << Assembly << "\n"; + + GPUModule.release(); + KernelIDs.clear(); + + return Assembly; +} +/// Construct an `isl_pw_aff_list` from a vector of `isl_pw_aff` +/// @param PwAffs The list of piecewise affine functions to create an +/// `isl_pw_aff_list` from. We expect an rvalue ref because +/// all the isl_pw_aff are used up by this function. +/// +/// @returns The `isl_pw_aff_list`. +__isl_give isl_pw_aff_list * +createPwAffList(isl_ctx *Context, + const std::vector<__isl_take isl_pw_aff *> &&PwAffs) { + isl_pw_aff_list *List = isl_pw_aff_list_alloc(Context, PwAffs.size()); + + for (unsigned i = 0; i < PwAffs.size(); i++) { + List = isl_pw_aff_list_insert(List, i, PwAffs[i]); + } + return List; +} + +/// Align all the `PwAffs` such that they have the same parameter dimensions. +/// +/// We loop over all `pw_aff` and align all of their spaces together to +/// create a common space for all the `pw_aff`. This common space is the +/// `AlignSpace`. We then align all the `pw_aff` to this space. We start +/// with the given `SeedSpace`. +/// @param PwAffs The list of piecewise affine functions we want to align. +/// This is an rvalue reference because the entire vector is +/// used up by the end of the operation. +/// @param SeedSpace The space to start the alignment process with. +/// @returns A std::pair, whose first element is the aligned space, +/// whose second element is the vector of aligned piecewise +/// affines. +static std::pair<__isl_give isl_space *, std::vector<__isl_give isl_pw_aff *>> +alignPwAffs(const std::vector<__isl_take isl_pw_aff *> &&PwAffs, + __isl_take isl_space *SeedSpace) { + assert(SeedSpace && "Invalid seed space given."); + + isl_space *AlignSpace = SeedSpace; + for (isl_pw_aff *PwAff : PwAffs) { + isl_space *PwAffSpace = isl_pw_aff_get_domain_space(PwAff); + AlignSpace = isl_space_align_params(AlignSpace, PwAffSpace); + } + std::vector AdjustedPwAffs; + + for (unsigned i = 0; i < PwAffs.size(); i++) { + isl_pw_aff *Adjusted = PwAffs[i]; + assert(Adjusted && "Invalid pw_aff given."); + Adjusted = isl_pw_aff_align_params(Adjusted, isl_space_copy(AlignSpace)); + AdjustedPwAffs.push_back(Adjusted); + } + return std::make_pair(AlignSpace, AdjustedPwAffs); +} + +namespace { +class PPCGCodeGeneration final : public ScopPass { +public: + static char ID; + + GPURuntime Runtime = GPURuntime::CUDA; + + GPUArch Architecture = GPUArch::NVPTX64; + + /// The scop that is currently processed. 
+ Scop *S; + + LoopInfo *LI; + DominatorTree *DT; + ScalarEvolution *SE; + const DataLayout *DL; + RegionInfo *RI; + + PPCGCodeGeneration() : ScopPass(ID) { + // Apply defaults. + Runtime = GPURuntimeChoice; + Architecture = GPUArchChoice; + } + + /// Construct compilation options for PPCG. + /// + /// @returns The compilation options. + ppcg_options *createPPCGOptions() { + auto DebugOptions = + (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); + auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); + + DebugOptions->dump_schedule_constraints = false; + DebugOptions->dump_schedule = false; + DebugOptions->dump_final_schedule = false; + DebugOptions->dump_sizes = false; + DebugOptions->verbose = false; + + Options->debug = DebugOptions; + + Options->group_chains = false; + Options->reschedule = true; + Options->scale_tile_loops = false; + Options->wrap = false; + + Options->non_negative_parameters = false; + Options->ctx = nullptr; + Options->sizes = nullptr; + + Options->tile = true; + Options->tile_size = 32; + + Options->isolate_full_tiles = false; + + Options->use_private_memory = PrivateMemory; + Options->use_shared_memory = SharedMemory; + Options->max_shared_memory = 48 * 1024; + + Options->target = PPCG_TARGET_CUDA; + Options->openmp = false; + Options->linearize_device_arrays = true; + Options->allow_gnu_extensions = false; + + Options->unroll_copy_shared = false; + Options->unroll_gpu_tile = false; + Options->live_range_reordering = true; + + Options->live_range_reordering = true; + Options->hybrid = false; + Options->opencl_compiler_options = nullptr; + Options->opencl_use_gpu = false; + Options->opencl_n_include_file = 0; + Options->opencl_include_files = nullptr; + Options->opencl_print_kernel_types = false; + Options->opencl_embed_kernel_code = false; + + Options->save_schedule_file = nullptr; + Options->load_schedule_file = nullptr; + + return Options; + } + + /// Get a tagged access relation containing all accesses of type @p AccessTy. + /// + /// Instead of a normal access of the form: + /// + /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] + /// + /// a tagged access has the form + /// + /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] + /// + /// where 'id' is an additional space that references the memory access that + /// triggered the access. + /// + /// @param AccessTy The type of the memory accesses to collect. + /// + /// @return The relation describing all tagged memory accesses. + isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { + isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace().release()); + + for (auto &Stmt : *S) + for (auto &Acc : Stmt) + if (Acc->getType() == AccessTy) { + isl_map *Relation = Acc->getAccessRelation().release(); + Relation = + isl_map_intersect_domain(Relation, Stmt.getDomain().release()); + + isl_space *Space = isl_map_get_space(Relation); + Space = isl_space_range(Space); + Space = isl_space_from_range(Space); + Space = + isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release()); + isl_map *Universe = isl_map_universe(Space); + Relation = isl_map_domain_product(Relation, Universe); + Accesses = isl_union_map_add_map(Accesses, Relation); + } + + return Accesses; + } + + /// Get the set of all read accesses, tagged with the access id. + /// + /// @see getTaggedAccesses + isl_union_map *getTaggedReads() { + return getTaggedAccesses(MemoryAccess::READ); + } + + /// Get the set of all may (and must) accesses, tagged with the access id. 
+  ///
+  /// @see getTaggedAccesses
+  isl_union_map *getTaggedMayWrites() {
+    return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
+                               getTaggedAccesses(MemoryAccess::MUST_WRITE));
+  }
+
+  /// Get the set of all must accesses, tagged with the access id.
+  ///
+  /// @see getTaggedAccesses
+  isl_union_map *getTaggedMustWrites() {
+    return getTaggedAccesses(MemoryAccess::MUST_WRITE);
+  }
+
+  /// Collect parameter and array names as isl_ids.
+  ///
+  /// To reason about the different parameters and arrays used, ppcg requires
+  /// a list of all isl_ids in use. As PPCG traditionally performs
+  /// source-to-source compilation, each of these isl_ids is mapped to the
+  /// expression that represents it. As we do not have a corresponding
+  /// expression in Polly, we just map each id to a 'zero' expression to match
+  /// the data format that ppcg expects.
+  ///
+  /// @returns A map from collected ids to 'zero' ast expressions.
+  __isl_give isl_id_to_ast_expr *getNames() {
+    auto *Names = isl_id_to_ast_expr_alloc(
+        S->getIslCtx().get(),
+        S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
+    auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx().get()));
+
+    for (const SCEV *P : S->parameters()) {
+      isl_id *Id = S->getIdForParam(P).release();
+      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
+    }
+
+    for (auto &Array : S->arrays()) {
+      auto Id = Array->getBasePtrId().release();
+      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
+    }
+
+    isl_ast_expr_free(Zero);
+
+    return Names;
+  }
+
+  /// Create a new PPCG scop from the current scop.
+  ///
+  /// The PPCG scop is initialized with data from the current polly::Scop. From
+  /// this initial data, the data-dependences in the PPCG scop are initialized.
+  /// We do not use Polly's dependence analysis for now, to ensure we match
+  /// the PPCG default behaviour more closely.
+  ///
+  /// @returns A new ppcg scop.
+  ppcg_scop *createPPCGScop() {
+    MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
+
+    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));
+
+    PPCGScop->options = createPPCGOptions();
+    // Enable live range reordering.
+    PPCGScop->options->live_range_reordering = 1;
+
+    PPCGScop->start = 0;
+    PPCGScop->end = 0;
+
+    PPCGScop->context = S->getContext().release();
+    PPCGScop->domain = S->getDomains().release();
+    // TODO: investigate this further. PPCG calls collect_call_domains.
+ PPCGScop->call = isl_union_set_from_set(S->getContext().release()); + PPCGScop->tagged_reads = getTaggedReads(); + PPCGScop->reads = S->getReads().release(); + PPCGScop->live_in = nullptr; + PPCGScop->tagged_may_writes = getTaggedMayWrites(); + PPCGScop->may_writes = S->getWrites().release(); + PPCGScop->tagged_must_writes = getTaggedMustWrites(); + PPCGScop->must_writes = S->getMustWrites().release(); + PPCGScop->live_out = nullptr; + PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.release(); + PPCGScop->must_kills = KillsInfo.MustKills.release(); + + PPCGScop->tagger = nullptr; + PPCGScop->independence = + isl_union_map_empty(isl_set_get_space(PPCGScop->context)); + PPCGScop->dep_flow = nullptr; + PPCGScop->tagged_dep_flow = nullptr; + PPCGScop->dep_false = nullptr; + PPCGScop->dep_forced = nullptr; + PPCGScop->dep_order = nullptr; + PPCGScop->tagged_dep_order = nullptr; + + PPCGScop->schedule = S->getScheduleTree().release(); + // If we have something non-trivial to kill, add it to the schedule + if (KillsInfo.KillsSchedule.get()) + PPCGScop->schedule = isl_schedule_sequence( + PPCGScop->schedule, KillsInfo.KillsSchedule.release()); + + PPCGScop->names = getNames(); + PPCGScop->pet = nullptr; + + compute_tagger(PPCGScop); + compute_dependences(PPCGScop); + eliminate_dead_code(PPCGScop); + + return PPCGScop; + } + + /// Collect the array accesses in a statement. + /// + /// @param Stmt The statement for which to collect the accesses. + /// + /// @returns A list of array accesses. + gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) { + gpu_stmt_access *Accesses = nullptr; + + for (MemoryAccess *Acc : Stmt) { + auto Access = + isl_alloc_type(S->getIslCtx().get(), struct gpu_stmt_access); + Access->read = Acc->isRead(); + Access->write = Acc->isWrite(); + Access->access = Acc->getAccessRelation().release(); + isl_space *Space = isl_map_get_space(Access->access); + Space = isl_space_range(Space); + Space = isl_space_from_range(Space); + Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release()); + isl_map *Universe = isl_map_universe(Space); + Access->tagged_access = + isl_map_domain_product(Acc->getAccessRelation().release(), Universe); + Access->exact_write = !Acc->isMayWrite(); + Access->ref_id = Acc->getId().release(); + Access->next = Accesses; + Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions(); + // TODO: Also mark one-element accesses to arrays as fixed-element. + Access->fixed_element = + Acc->isLatestScalarKind() ? isl_bool_true : isl_bool_false; + Accesses = Access; + } + + return Accesses; + } + + /// Collect the list of GPU statements. + /// + /// Each statement has an id, a pointer to the underlying data structure, + /// as well as a list with all memory accesses. + /// + /// TODO: Initialize the list of memory accesses. + /// + /// @returns A linked-list of statements. + gpu_stmt *getStatements() { + gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx().get(), struct gpu_stmt, + std::distance(S->begin(), S->end())); + + int i = 0; + for (auto &Stmt : *S) { + gpu_stmt *GPUStmt = &Stmts[i]; + + GPUStmt->id = Stmt.getDomainId().release(); + + // We use the pet stmt pointer to keep track of the Polly statements. + GPUStmt->stmt = (pet_stmt *)&Stmt; + GPUStmt->accesses = getStmtAccesses(Stmt); + i++; + } + + return Stmts; + } + + /// Derive the extent of an array. + /// + /// The extent of an array is the set of elements that are within the + /// accessed array. 
For the inner dimensions, the extent constraints are + /// 0 and the size of the corresponding array dimension. For the first + /// (outermost) dimension, the extent constraints are the minimal and maximal + /// subscript value for the first dimension. + /// + /// @param Array The array to derive the extent for. + /// + /// @returns An isl_set describing the extent of the array. + isl::set getExtent(ScopArrayInfo *Array) { + unsigned NumDims = Array->getNumberOfDimensions(); + + if (Array->getNumberOfDimensions() == 0) + return isl::set::universe(Array->getSpace()); + + isl::union_map Accesses = S->getAccesses(Array); + isl::union_set AccessUSet = Accesses.range(); + AccessUSet = AccessUSet.coalesce(); + AccessUSet = AccessUSet.detect_equalities(); + AccessUSet = AccessUSet.coalesce(); + + if (AccessUSet.is_empty()) + return isl::set::empty(Array->getSpace()); + + isl::set AccessSet = AccessUSet.extract_set(Array->getSpace()); + + isl::local_space LS = isl::local_space(Array->getSpace()); + + isl::pw_aff Val = isl::aff::var_on_domain(LS, isl::dim::set, 0); + isl::pw_aff OuterMin = AccessSet.dim_min(0); + isl::pw_aff OuterMax = AccessSet.dim_max(0); + OuterMin = OuterMin.add_dims(isl::dim::in, + unsignedFromIslSize(Val.dim(isl::dim::in))); + OuterMax = OuterMax.add_dims(isl::dim::in, + unsignedFromIslSize(Val.dim(isl::dim::in))); + OuterMin = OuterMin.set_tuple_id(isl::dim::in, Array->getBasePtrId()); + OuterMax = OuterMax.set_tuple_id(isl::dim::in, Array->getBasePtrId()); + + isl::set Extent = isl::set::universe(Array->getSpace()); + + Extent = Extent.intersect(OuterMin.le_set(Val)); + Extent = Extent.intersect(OuterMax.ge_set(Val)); + + for (unsigned i = 1; i < NumDims; ++i) + Extent = Extent.lower_bound_si(isl::dim::set, i, 0); + + for (unsigned i = 0; i < NumDims; ++i) { + isl::pw_aff PwAff = Array->getDimensionSizePw(i); + + // isl_pw_aff can be NULL for zero dimension. Only in the case of a + // Fortran array will we have a legitimate dimension. + if (PwAff.is_null()) { + assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension"); + continue; + } + + isl::pw_aff Val = isl::aff::var_on_domain( + isl::local_space(Array->getSpace()), isl::dim::set, i); + PwAff = PwAff.add_dims(isl::dim::in, + unsignedFromIslSize(Val.dim(isl::dim::in))); + PwAff = PwAff.set_tuple_id(isl::dim::in, Val.get_tuple_id(isl::dim::in)); + isl::set Set = PwAff.gt_set(Val); + Extent = Set.intersect(Extent); + } + + return Extent; + } + + /// Derive the bounds of an array. + /// + /// For the first dimension we derive the bound of the array from the extent + /// of this dimension. For inner dimensions we obtain their size directly from + /// ScopArrayInfo. + /// + /// @param PPCGArray The array to compute bounds for. + /// @param Array The polly array from which to take the information. 
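+  ///
+  /// A sketch of the intended result (hypothetical two-dimensional array A
+  /// with extent [n] -> { A[i, j] : 0 <= i < n and 0 <= j < 64 }): the outer
+  /// bound is derived from the extent as dim_max(i) + 1 = n, whereas the
+  /// inner bound 64 is taken directly from ScopArrayInfo.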
+  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
+    std::vector<isl_pw_aff *> Bounds;
+
+    if (PPCGArray.n_index > 0) {
+      if (isl_set_is_empty(PPCGArray.extent)) {
+        isl_set *Dom = isl_set_copy(PPCGArray.extent);
+        isl_local_space *LS = isl_local_space_from_space(
+            isl_space_params(isl_set_get_space(Dom)));
+        isl_set_free(Dom);
+        isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS));
+        Bounds.push_back(Zero);
+      } else {
+        isl_set *Dom = isl_set_copy(PPCGArray.extent);
+        Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
+        isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
+        isl_set_free(Dom);
+        Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
+        isl_local_space *LS =
+            isl_local_space_from_space(isl_set_get_space(Dom));
+        isl_aff *One = isl_aff_zero_on_domain(LS);
+        One = isl_aff_add_constant_si(One, 1);
+        Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
+        Bound = isl_pw_aff_gist(Bound, S->getContext().release());
+        Bounds.push_back(Bound);
+      }
+    }
+
+    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
+      isl_pw_aff *Bound = Array->getDimensionSizePw(i).release();
+      auto LS = isl_pw_aff_get_domain_space(Bound);
+      auto Aff = isl_multi_aff_zero(LS);
+
+      // We need types to work out, which is why we perform this weird dance
+      // with `Aff` and `Bound`. Consider this example:
+
+      // LS: [p] -> { [] }
+      // Zero: [p] -> { [] } | Implicitly, is [p] -> { ~ -> [] }.
+      // This `~` is used to denote a "null space" (which is different from
+      // a *zero dimensional* space), which is something that ISL does not
+      // show you when pretty printing.
+
+      // Bound: [p] -> { [] -> [(10p)] } | Here, the [] is a *zero dimensional*
+      // space, not a "null space" which does not exist at all.
+
+      // When we pullback (precompose) `Bound` with `Zero`, we get:
+      // Bound . Zero =
+      //     ([p] -> { [] -> [(10p)] }) . ([p] -> {~ -> [] }) =
+      //     [p] -> { ~ -> [(10p)] } =
+      //     [p] -> [(10p)] (as ISL pretty prints it)
+      // Bound Pullback: [p] -> { [(10p)] }
+
+      // We want this kind of an expression for Bound, without a
+      // zero dimensional input, but with a "null space" input for the types
+      // to work out later on, as far as I (Siddharth Bhat) understand.
+      // I was unable to find a reference to this in the ISL manual.
+      // References: Tobias Grosser.
+
+      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
+      Bounds.push_back(Bound);
+    }
+
+    /// To construct an `isl_multi_pw_aff`, we need all the individual
+    /// `pw_aff` to have the same parameter dimensions. So, we need to align
+    /// them to an appropriate space.
+    /// Scop::Context is _not_ an appropriate space, because when we have
+    /// `-polly-ignore-parameter-bounds` enabled, the Scop::Context does not
+    /// contain all parameter dimensions.
+    /// So, use the helper `alignPwAffs` to align all the `isl_pw_aff` together.
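+    ///
+    /// A sketch of what the alignment does (hypothetical pw_affs): aligning
+    /// [n] -> { [(n)] } and [m] -> { [(m + 1)] } yields the common parameter
+    /// space [n, m] and the rewritten pw_affs [n, m] -> { [(n)] } and
+    /// [n, m] -> { [(m + 1)] }.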
+ isl_space *SeedAlignSpace = S->getParamSpace().release(); + SeedAlignSpace = isl_space_add_dims(SeedAlignSpace, isl_dim_set, 1); + + isl_space *AlignSpace = nullptr; + std::vector AlignedBounds; + std::tie(AlignSpace, AlignedBounds) = + alignPwAffs(std::move(Bounds), SeedAlignSpace); + + assert(AlignSpace && "alignPwAffs did not initialise AlignSpace"); + + isl_pw_aff_list *BoundsList = + createPwAffList(S->getIslCtx().get(), std::move(AlignedBounds)); + + isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent); + BoundsSpace = isl_space_align_params(BoundsSpace, AlignSpace); + + assert(BoundsSpace && "Unable to access space of array."); + assert(BoundsList && "Unable to access list of bounds."); + + PPCGArray.bound = + isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList); + assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly."); + } + + /// Create the arrays for @p PPCGProg. + /// + /// @param PPCGProg The program to compute the arrays for. + void createArrays(gpu_prog *PPCGProg, + const SmallVector &ValidSAIs) { + int i = 0; + for (auto &Array : ValidSAIs) { + std::string TypeName; + raw_string_ostream OS(TypeName); + + OS << *Array->getElementType(); + TypeName = OS.str(); + + gpu_array_info &PPCGArray = PPCGProg->array[i]; + + PPCGArray.space = Array->getSpace().release(); + PPCGArray.type = strdup(TypeName.c_str()); + PPCGArray.size = DL->getTypeAllocSize(Array->getElementType()); + PPCGArray.name = strdup(Array->getName().c_str()); + PPCGArray.extent = nullptr; + PPCGArray.n_index = Array->getNumberOfDimensions(); + PPCGArray.extent = getExtent(Array).release(); + PPCGArray.n_ref = 0; + PPCGArray.refs = nullptr; + PPCGArray.accessed = true; + PPCGArray.read_only_scalar = + Array->isReadOnly() && Array->getNumberOfDimensions() == 0; + PPCGArray.has_compound_element = false; + PPCGArray.local = false; + PPCGArray.declare_local = false; + PPCGArray.global = false; + PPCGArray.linearize = false; + PPCGArray.dep_order = nullptr; + PPCGArray.user = Array; + + PPCGArray.bound = nullptr; + setArrayBounds(PPCGArray, Array); + i++; + + collect_references(PPCGProg, &PPCGArray); + PPCGArray.only_fixed_element = only_fixed_element_accessed(&PPCGArray); + } + } + + /// Create an identity map between the arrays in the scop. + /// + /// @returns An identity map between the arrays in the scop. + isl_union_map *getArrayIdentity() { + isl_union_map *Maps = isl_union_map_empty(S->getParamSpace().release()); + + for (auto &Array : S->arrays()) { + isl_space *Space = Array->getSpace().release(); + Space = isl_space_map_from_set(Space); + isl_map *Identity = isl_map_identity(Space); + Maps = isl_union_map_add_map(Maps, Identity); + } + + return Maps; + } + + /// Create a default-initialized PPCG GPU program. + /// + /// @returns A new gpu program description. + gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) { + + if (!PPCGScop) + return nullptr; + + auto PPCGProg = isl_calloc_type(S->getIslCtx().get(), struct gpu_prog); + + PPCGProg->ctx = S->getIslCtx().get(); + PPCGProg->scop = PPCGScop; + PPCGProg->context = isl_set_copy(PPCGScop->context); + PPCGProg->read = isl_union_map_copy(PPCGScop->reads); + PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes); + PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes); + PPCGProg->tagged_must_kill = + isl_union_map_copy(PPCGScop->tagged_must_kills); + PPCGProg->to_inner = getArrayIdentity(); + PPCGProg->to_outer = getArrayIdentity(); + // TODO: verify that this assignment is correct. 
+ PPCGProg->any_to_outer = nullptr; + PPCGProg->n_stmts = std::distance(S->begin(), S->end()); + PPCGProg->stmts = getStatements(); + + // Only consider arrays that have a non-empty extent. + // Otherwise, this will cause us to consider the following kinds of + // empty arrays: + // 1. Invariant loads that are represented by SAI objects. + // 2. Arrays with statically known zero size. + auto ValidSAIsRange = + make_filter_range(S->arrays(), [this](ScopArrayInfo *SAI) -> bool { + return !getExtent(SAI).is_empty(); + }); + SmallVector ValidSAIs(ValidSAIsRange.begin(), + ValidSAIsRange.end()); + + PPCGProg->n_array = + ValidSAIs.size(); // std::distance(S->array_begin(), S->array_end()); + PPCGProg->array = isl_calloc_array( + S->getIslCtx().get(), struct gpu_array_info, PPCGProg->n_array); + + createArrays(PPCGProg, ValidSAIs); + + PPCGProg->array_order = nullptr; + collect_order_dependences(PPCGProg); + + PPCGProg->may_persist = compute_may_persist(PPCGProg); + return PPCGProg; + } + + struct PrintGPUUserData { + struct cuda_info *CudaInfo; + struct gpu_prog *PPCGProg; + std::vector Kernels; + }; + + /// Print a user statement node in the host code. + /// + /// We use ppcg's printing facilities to print the actual statement and + /// additionally build up a list of all kernels that are encountered in the + /// host ast. + /// + /// @param P The printer to print to + /// @param Options The printing options to use + /// @param Node The node to print + /// @param User A user pointer to carry additional data. This pointer is + /// expected to be of type PrintGPUUserData. + /// + /// @returns A printer to which the output has been printed. + static __isl_give isl_printer * + printHostUser(__isl_take isl_printer *P, + __isl_take isl_ast_print_options *Options, + __isl_take isl_ast_node *Node, void *User) { + auto Data = (struct PrintGPUUserData *)User; + auto Id = isl_ast_node_get_annotation(Node); + + if (Id) { + bool IsUser = !strcmp(isl_id_get_name(Id), "user"); + + // If this is a user statement, format it ourselves as ppcg would + // otherwise try to call pet functionality that is not available in + // Polly. + if (IsUser) { + P = isl_printer_start_line(P); + P = isl_printer_print_ast_node(P, Node); + P = isl_printer_end_line(P); + isl_id_free(Id); + isl_ast_print_options_free(Options); + return P; + } + + auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); + isl_id_free(Id); + Data->Kernels.push_back(Kernel); + } + + return print_host_user(P, Options, Node, User); + } + + /// Print C code corresponding to the control flow in @p Kernel. + /// + /// @param Kernel The kernel to print + void printKernel(ppcg_kernel *Kernel) { + auto *P = isl_printer_to_str(S->getIslCtx().get()); + P = isl_printer_set_output_format(P, ISL_FORMAT_C); + auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get()); + P = isl_ast_node_print(Kernel->tree, P, Options); + char *String = isl_printer_get_str(P); + outs() << String << "\n"; + free(String); + isl_printer_free(P); + } + + /// Print C code corresponding to the GPU code described by @p Tree. + /// + /// @param Tree An AST describing GPU code + /// @param PPCGProg The PPCG program from which @Tree has been constructed. 
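+  ///
+  /// The emitted output has roughly the following shape (a sketch, with
+  /// placeholders in angle brackets):
+  ///
+  ///   # host
+  ///   <C code for the host AST>
+  ///   # kernel0
+  ///   <C code for kernel 0>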
+ void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { + auto *P = isl_printer_to_str(S->getIslCtx().get()); + P = isl_printer_set_output_format(P, ISL_FORMAT_C); + + PrintGPUUserData Data; + Data.PPCGProg = PPCGProg; + + auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get()); + Options = + isl_ast_print_options_set_print_user(Options, printHostUser, &Data); + P = isl_ast_node_print(Tree, P, Options); + char *String = isl_printer_get_str(P); + outs() << "# host\n"; + outs() << String << "\n"; + free(String); + isl_printer_free(P); + + for (auto Kernel : Data.Kernels) { + outs() << "# kernel" << Kernel->id << "\n"; + printKernel(Kernel); + } + } + + // Generate a GPU program using PPCG. + // + // GPU mapping consists of multiple steps: + // + // 1) Compute new schedule for the program. + // 2) Map schedule to GPU (TODO) + // 3) Generate code for new schedule (TODO) + // + // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer + // is mostly CPU specific. Instead, we use PPCG's GPU code generation + // strategy directly from this pass. + gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { + + auto PPCGGen = isl_calloc_type(S->getIslCtx().get(), struct gpu_gen); + + PPCGGen->ctx = S->getIslCtx().get(); + PPCGGen->options = PPCGScop->options; + PPCGGen->print = nullptr; + PPCGGen->print_user = nullptr; + PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; + PPCGGen->prog = PPCGProg; + PPCGGen->tree = nullptr; + PPCGGen->types.n = 0; + PPCGGen->types.name = nullptr; + PPCGGen->sizes = nullptr; + PPCGGen->used_sizes = nullptr; + PPCGGen->kernel_id = 0; + + // Set scheduling strategy to same strategy PPCG is using. + isl_options_set_schedule_serialize_sccs(PPCGGen->ctx, false); + isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); + isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); + isl_options_set_schedule_whole_component(PPCGGen->ctx, false); + + isl_schedule *Schedule = get_schedule(PPCGGen); + + int has_permutable = has_any_permutable_node(Schedule); + + Schedule = + isl_schedule_align_params(Schedule, S->getFullParamSpace().release()); + + if (!has_permutable || has_permutable < 0) { + Schedule = isl_schedule_free(Schedule); + LLVM_DEBUG(dbgs() << getUniqueScopName(S) + << " does not have permutable bands. Bailing out\n";); + } else { + const bool CreateTransferToFromDevice = !PollyManagedMemory; + Schedule = map_to_device(PPCGGen, Schedule, CreateTransferToFromDevice); + PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); + } + + if (DumpSchedule) { + isl_printer *P = isl_printer_to_str(S->getIslCtx().get()); + P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); + P = isl_printer_print_str(P, "Schedule\n"); + P = isl_printer_print_str(P, "========\n"); + if (Schedule) + P = isl_printer_print_schedule(P, Schedule); + else + P = isl_printer_print_str(P, "No schedule found\n"); + + outs() << isl_printer_get_str(P) << "\n"; + isl_printer_free(P); + } + + if (DumpCode) { + outs() << "Code\n"; + outs() << "====\n"; + if (PPCGGen->tree) + printGPUTree(PPCGGen->tree, PPCGProg); + else + outs() << "No code generated\n"; + } + + isl_schedule_free(Schedule); + + return PPCGGen; + } + + /// Free gpu_gen structure. + /// + /// @param PPCGGen The ppcg_gen object to free. + void freePPCGGen(gpu_gen *PPCGGen) { + isl_ast_node_free(PPCGGen->tree); + isl_union_map_free(PPCGGen->sizes); + isl_union_map_free(PPCGGen->used_sizes); + free(PPCGGen); + } + + /// Free the options in the ppcg scop structure. 
+ /// + /// ppcg is not freeing these options for us. To avoid leaks we do this + /// ourselves. + /// + /// @param PPCGScop The scop referencing the options to free. + void freeOptions(ppcg_scop *PPCGScop) { + free(PPCGScop->options->debug); + PPCGScop->options->debug = nullptr; + free(PPCGScop->options); + PPCGScop->options = nullptr; + } + + /// Approximate the number of points in the set. + /// + /// This function returns an ast expression that overapproximates the number + /// of points in an isl set through the rectangular hull surrounding this set. + /// + /// @param Set The set to count. + /// @param Build The isl ast build object to use for creating the ast + /// expression. + /// + /// @returns An approximation of the number of points in the set. + __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set, + __isl_keep isl_ast_build *Build) { + + isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1); + auto *Expr = isl_ast_expr_from_val(isl_val_copy(One)); + + isl_space *Space = isl_set_get_space(Set); + Space = isl_space_params(Space); + auto *Univ = isl_set_universe(Space); + isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One); + + for (long i = 0, n = isl_set_dim(Set, isl_dim_set); i < n; i++) { + isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i); + isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i); + isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min); + DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff)); + auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize); + Expr = isl_ast_expr_mul(Expr, DimSizeExpr); + } + + isl_set_free(Set); + isl_pw_aff_free(OneAff); + + return Expr; + } + + /// Approximate a number of dynamic instructions executed by a given + /// statement. + /// + /// @param Stmt The statement for which to compute the number of dynamic + /// instructions. + /// @param Build The isl ast build object to use for creating the ast + /// expression. + /// @returns An approximation of the number of dynamic instructions executed + /// by @p Stmt. + __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt, + __isl_keep isl_ast_build *Build) { + auto Iterations = approxPointsInSet(Stmt.getDomain().release(), Build); + + long InstCount = 0; + + if (Stmt.isBlockStmt()) { + auto *BB = Stmt.getBasicBlock(); + InstCount = std::distance(BB->begin(), BB->end()); + } else { + auto *R = Stmt.getRegion(); + + for (auto *BB : R->blocks()) { + InstCount += std::distance(BB->begin(), BB->end()); + } + } + + isl_val *InstVal = isl_val_int_from_si(S->getIslCtx().get(), InstCount); + auto *InstExpr = isl_ast_expr_from_val(InstVal); + return isl_ast_expr_mul(InstExpr, Iterations); + } + + /// Approximate dynamic instructions executed in scop. + /// + /// @param S The scop for which to approximate dynamic instructions. + /// @param Build The isl ast build object to use for creating the ast + /// expression. + /// @returns An approximation of the number of dynamic instructions executed + /// in @p S. + __isl_give isl_ast_expr * + getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) { + isl_ast_expr *Instructions; + + isl_val *Zero = isl_val_int_from_si(S.getIslCtx().get(), 0); + Instructions = isl_ast_expr_from_val(Zero); + + for (ScopStmt &Stmt : S) { + isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build); + Instructions = isl_ast_expr_add(Instructions, StmtInstructions); + } + return Instructions; + } + + /// Create a check that ensures sufficient compute in scop. 
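+  /// The check compares an (over)approximation of the number of dynamic
+  /// instructions executed in the scop, as computed by
+  /// getNumberOfIterations(), against the MinCompute threshold.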
+ /// + /// @param S The scop for which to ensure sufficient compute. + /// @param Build The isl ast build object to use for creating the ast + /// expression. + /// @returns An expression that evaluates to TRUE in case of sufficient + /// compute and to FALSE, otherwise. + __isl_give isl_ast_expr * + createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) { + auto Iterations = getNumberOfIterations(S, Build); + auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx().get(), MinCompute); + auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal); + return isl_ast_expr_ge(Iterations, MinComputeExpr); + } + + /// Check if the basic block contains a function we cannot codegen for GPU + /// kernels. + /// + /// If this basic block does something with a `Function` other than calling + /// a function that we support in a kernel, return true. + bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB, + bool AllowCUDALibDevice) { + for (const Instruction &Inst : *BB) { + const CallInst *Call = dyn_cast(&Inst); + if (Call && isValidFunctionInKernel(Call->getCalledFunction(), + AllowCUDALibDevice)) + continue; + + for (Value *Op : Inst.operands()) + // Look for functions among operands of Inst. + if (isa(Op->stripPointerCasts())) { + LLVM_DEBUG(dbgs() + << Inst << " has illegal use of function in kernel.\n"); + return true; + } + } + return false; + } + + /// Return whether the Scop S uses functions in a way that we do not support. + bool containsInvalidKernelFunction(const Scop &S, bool AllowCUDALibDevice) { + for (auto &Stmt : S) { + if (Stmt.isBlockStmt()) { + if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock(), + AllowCUDALibDevice)) + return true; + } else { + assert(Stmt.isRegionStmt() && + "Stmt was neither block nor region statement"); + for (const BasicBlock *BB : Stmt.getRegion()->blocks()) + if (containsInvalidKernelFunctionInBlock(BB, AllowCUDALibDevice)) + return true; + } + } + return false; + } + + /// Generate code for a given GPU AST described by @p Root. + /// + /// @param Root An isl_ast_node pointing to the root of the GPU AST. + /// @param Prog The GPU Program to generate code for. + void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { + ScopAnnotator Annotator; + Annotator.buildAliasScopes(*S); + + Region *R = &S->getRegion(); + + simplifyRegion(R, DT, LI, RI); + + BasicBlock *EnteringBB = R->getEnteringBlock(); + + PollyIRBuilder Builder(EnteringBB->getContext(), ConstantFolder(), + IRInserter(Annotator)); + Builder.SetInsertPoint(EnteringBB->getTerminator()); + + // Only build the run-time condition and parameters _after_ having + // introduced the conditional branch. This is important as the conditional + // branch will guard the original scop from new induction variables that + // the SCEVExpander may introduce while code generating the parameters and + // which may introduce scalar dependences that prevent us from correctly + // code generating this scop. 
+ BBPair StartExitBlocks; + BranchInst *CondBr = nullptr; + std::tie(StartExitBlocks, CondBr) = + executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); + BasicBlock *StartBlock = std::get<0>(StartExitBlocks); + + assert(CondBr && "CondBr not initialized by executeScopConditionally"); + + GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, + StartBlock, Prog, Runtime, Architecture); + + // TODO: Handle LICM + auto SplitBlock = StartBlock->getSinglePredecessor(); + Builder.SetInsertPoint(SplitBlock->getTerminator()); + + isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx().get()); + isl::ast_expr Condition = + IslAst::buildRunCondition(*S, isl::manage_copy(Build)); + isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build); + Condition = + isl::manage(isl_ast_expr_and(Condition.release(), SufficientCompute)); + isl_ast_build_free(Build); + + // preload invariant loads. Note: This should happen before the RTC + // because the RTC may depend on values that are invariant load hoisted. + if (!NodeBuilder.preloadInvariantLoads()) { + // Patch the introduced branch condition to ensure that we always execute + // the original SCoP. + auto *FalseI1 = Builder.getFalse(); + auto *SplitBBTerm = Builder.GetInsertBlock()->getTerminator(); + SplitBBTerm->setOperand(0, FalseI1); + + LLVM_DEBUG(dbgs() << "preloading invariant loads failed in function: " + + S->getFunction().getName() + + " | Scop Region: " + S->getNameStr()); + // adjust the dominator tree accordingly. + auto *ExitingBlock = StartBlock->getUniqueSuccessor(); + assert(ExitingBlock); + auto *MergeBlock = ExitingBlock->getUniqueSuccessor(); + assert(MergeBlock); + polly::markBlockUnreachable(*StartBlock, Builder); + polly::markBlockUnreachable(*ExitingBlock, Builder); + auto *ExitingBB = S->getExitingBlock(); + assert(ExitingBB); + + DT->changeImmediateDominator(MergeBlock, ExitingBB); + DT->eraseNode(ExitingBlock); + isl_ast_node_free(Root); + } else { + + if (polly::PerfMonitoring) { + PerfMonitor P(*S, EnteringBB->getParent()->getParent()); + P.initialize(); + P.insertRegionStart(SplitBlock->getTerminator()); + + // TODO: actually think if this is the correct exiting block to place + // the `end` performance marker. Invariant load hoisting changes + // the CFG in a way that I do not precisely understand, so I + // (Siddharth) should come back to this and + // think about which exiting block to use. + auto *ExitingBlock = StartBlock->getUniqueSuccessor(); + assert(ExitingBlock); + BasicBlock *MergeBlock = ExitingBlock->getUniqueSuccessor(); + P.insertRegionEnd(MergeBlock->getTerminator()); + } + + NodeBuilder.addParameters(S->getContext().release()); + Value *RTC = NodeBuilder.createRTC(Condition.release()); + Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC); + + Builder.SetInsertPoint(&*StartBlock->begin()); + + NodeBuilder.create(Root); + } + + /// In case a sequential kernel has more surrounding loops as any parallel + /// kernel, the SCoP is probably mostly sequential. Hence, there is no + /// point in running it on a GPU. 
+ if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel) + CondBr->setOperand(0, Builder.getFalse()); + + if (!NodeBuilder.BuildSuccessful) + CondBr->setOperand(0, Builder.getFalse()); + } + + bool runOnScop(Scop &CurrentScop) override { + S = &CurrentScop; + LI = &getAnalysis().getLoopInfo(); + DT = &getAnalysis().getDomTree(); + SE = &getAnalysis().getSE(); + DL = &S->getRegion().getEntry()->getModule()->getDataLayout(); + RI = &getAnalysis().getRegionInfo(); + + LLVM_DEBUG(dbgs() << "PPCGCodeGen running on : " << getUniqueScopName(S) + << " | loop depth: " << S->getMaxLoopDepth() << "\n"); + + // We currently do not support functions other than intrinsics inside + // kernels, as code generation will need to offload function calls to the + // kernel. This may lead to a kernel trying to call a function on the host. + // This also allows us to prevent codegen from trying to take the + // address of an intrinsic function to send to the kernel. + if (containsInvalidKernelFunction(CurrentScop, + Architecture == GPUArch::NVPTX64)) { + LLVM_DEBUG( + dbgs() << getUniqueScopName(S) + << " contains function which cannot be materialised in a GPU " + "kernel. Bailing out.\n";); + return false; + } + + auto PPCGScop = createPPCGScop(); + auto PPCGProg = createPPCGProg(PPCGScop); + auto PPCGGen = generateGPU(PPCGScop, PPCGProg); + + if (PPCGGen->tree) { + generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); + CurrentScop.markAsToBeSkipped(); + } else { + LLVM_DEBUG(dbgs() << getUniqueScopName(S) + << " has empty PPCGGen->tree. Bailing out.\n"); + } + + freeOptions(PPCGScop); + freePPCGGen(PPCGGen); + gpu_prog_free(PPCGProg); + ppcg_scop_free(PPCGScop); + + return true; + } + + void printScop(raw_ostream &, Scop &) const override {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ScopPass::getAnalysisUsage(AU); + + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + + // FIXME: We do not yet add regions for the newly generated code to the + // region tree. 
+ } +}; +} // namespace + +char PPCGCodeGeneration::ID = 1; + +Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) { + PPCGCodeGeneration *generator = new PPCGCodeGeneration(); + generator->Runtime = Runtime; + generator->Architecture = Arch; + return generator; +} + +INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", + "Polly - Apply PPCG translation to SCOP", false, false) +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg", + "Polly - Apply PPCG translation to SCOP", false, false) diff --git a/polly/lib/CodeGen/RuntimeDebugBuilder.cpp b/polly/lib/CodeGen/RuntimeDebugBuilder.cpp --- a/polly/lib/CodeGen/RuntimeDebugBuilder.cpp +++ b/polly/lib/CodeGen/RuntimeDebugBuilder.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "polly/CodeGen/RuntimeDebugBuilder.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/Module.h" #include #include @@ -16,16 +17,6 @@ using namespace llvm; using namespace polly; -llvm::Value *RuntimeDebugBuilder::getPrintableString(PollyIRBuilder &Builder, - llvm::StringRef Str) { - // FIXME: addressspace(4) is a marker for a string (for the %s conversion - // specifier) but should be using the default address space. This only works - // because CPU backends typically ignore the address space. For constant - // strings as returned by getPrintableString, the format string should instead - // directly spell out the string. 
- return Builder.CreateGlobalStringPtr(Str, "", 4); -} - Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); const char *Name = "vprintf"; @@ -42,9 +33,72 @@ return F; } -void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, +Function *RuntimeDebugBuilder::getAddressSpaceCast(PollyIRBuilder &Builder, + unsigned Src, unsigned Dst, + unsigned SrcBits, + unsigned DstBits) { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + auto Name = std::string("llvm.nvvm.ptr.constant.to.gen.p") + + std::to_string(Dst) + "i" + std::to_string(DstBits) + ".p" + + std::to_string(Src) + "i" + std::to_string(SrcBits); + Function *F = M->getFunction(Name); + + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + FunctionType *Ty = FunctionType::get( + PointerType::get(Builder.getIntNTy(DstBits), Dst), + PointerType::get(Builder.getIntNTy(SrcBits), Src), false); + F = Function::Create(Ty, Linkage, Name, M); + } + + return F; +} + +std::vector +RuntimeDebugBuilder::getGPUThreadIdentifiers(PollyIRBuilder &Builder) { + std::vector Identifiers; + + auto M = Builder.GetInsertBlock()->getParent()->getParent(); + + std::vector BlockIDs = { + Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_x), + Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_y), + Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_z), + }; + + Identifiers.push_back(Builder.CreateGlobalStringPtr("> block-id: ", "", 4)); + for (auto GetID : BlockIDs) { + Value *Id = Builder.CreateCall(GetID, {}); + Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false); + Identifiers.push_back(Id); + Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4)); + } + + Identifiers.push_back(Builder.CreateGlobalStringPtr("| ", "", 4)); + + std::vector ThreadIDs = { + Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_x), + Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_y), + Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_z), + }; + + Identifiers.push_back(Builder.CreateGlobalStringPtr("thread-id: ", "", 4)); + for (auto GetId : ThreadIDs) { + Value *Id = Builder.CreateCall(GetId, {}); + Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false); + Identifiers.push_back(Id); + Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4)); + } + + return Identifiers; +} + +void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, bool IsGPU, ArrayRef Values) { - createCPUPrinterT(Builder, Values); + if (IsGPU) + createGPUPrinterT(Builder, Values); + else + createCPUPrinterT(Builder, Values); } bool RuntimeDebugBuilder::isPrintable(Type *Ty) { @@ -115,6 +169,78 @@ createFlush(Builder); } +void RuntimeDebugBuilder::createGPUPrinterT(PollyIRBuilder &Builder, + ArrayRef Values) { + std::string str; + + auto *Zero = Builder.getInt64(0); + + auto ToPrint = getGPUThreadIdentifiers(Builder); + + ToPrint.push_back(Builder.CreateGlobalStringPtr("\n ", "", 4)); + ToPrint.insert(ToPrint.end(), Values.begin(), Values.end()); + + const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); + + // Allocate print buffer (assuming 2*32 bit per element) + auto T = ArrayType::get(Builder.getInt32Ty(), ToPrint.size() * 2); + Value *Data = new AllocaInst( + T, DL.getAllocaAddrSpace(), "polly.vprint.buffer", + &Builder.GetInsertBlock()->getParent()->getEntryBlock().front()); + auto *DataPtr = Builder.CreateGEP(T, Data, {Zero, Zero}); + + 
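+  // Each value is widened below to 64 bits (integers) or double (floats) and
+  // stored into an 8-byte (two i32) slot of the buffer, while the vprintf
+  // format string is built up in parallel (%ld, %f or %s per element).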
int Offset = 0; + for (auto Val : ToPrint) { + auto Ptr = Builder.CreateGEP(Builder.getInt32Ty(), DataPtr, + Builder.getInt64(Offset)); + Type *Ty = Val->getType(); + + if (Ty->isFloatingPointTy()) { + if (!Ty->isDoubleTy()) + Val = Builder.CreateFPExt(Val, Builder.getDoubleTy()); + } else if (Ty->isIntegerTy()) { + if (Ty->getIntegerBitWidth() < 64) { + Val = Builder.CreateSExt(Val, Builder.getInt64Ty()); + } else { + assert(Ty->getIntegerBitWidth() == 64 && + "Integer types larger 64 bit not supported"); + // fallthrough + } + } else if (isa(Ty)) { + if (Ty == Builder.getInt8PtrTy(4)) { + // Pointers in constant address space are printed as strings + Val = Builder.CreateGEP(Builder.getInt8Ty(), Val, Builder.getInt64(0)); + auto F = RuntimeDebugBuilder::getAddressSpaceCast(Builder, 4, 0); + Val = Builder.CreateCall(F, Val); + } else { + Val = Builder.CreatePtrToInt(Val, Builder.getInt64Ty()); + } + } else { + llvm_unreachable("Unknown type"); + } + + Ty = Val->getType(); + Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Ty->getPointerTo(5)); + Builder.CreateAlignedStore(Val, Ptr, Align(4)); + + if (Ty->isFloatingPointTy()) + str += "%f"; + else if (Ty->isIntegerTy()) + str += "%ld"; + else + str += "%s"; + + Offset += 2; + } + + Value *Format = Builder.CreateGlobalStringPtr(str, "polly.vprintf.buffer", 4); + Format = Builder.CreateCall(getAddressSpaceCast(Builder, 4, 0), Format); + + Data = Builder.CreateBitCast(Data, Builder.getInt8PtrTy()); + + Builder.CreateCall(getVPrintF(Builder), {Format, Data}); +} + Function *RuntimeDebugBuilder::getPrintF(PollyIRBuilder &Builder) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); const char *Name = "printf"; diff --git a/polly/lib/External/CMakeLists.txt b/polly/lib/External/CMakeLists.txt --- a/polly/lib/External/CMakeLists.txt +++ b/polly/lib/External/CMakeLists.txt @@ -314,3 +314,91 @@ target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS}) target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS}) endif (POLLY_BUNDLED_ISL) + + +# External: Polyhedral Parallel Code Generator +if (GPU_CODEGEN) + set(PET_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pet") + set(PPCG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ppcg") + set(PPCG_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/ppcg") + + # Determine version of ppcg + if (EXISTS "${PPCG_SOURCE_DIR}/GIT_HEAD_ID") + # The source comes from a 'make dist' archive + file(READ "${PPCG_SOURCE_DIR}/GIT_HEAD_ID" PPCG_GIT_HEAD_ID) + string(STRIP "${PPCG_GIT_HEAD_ID}" PPCG_GIT_HEAD_ID) + elseif (EXISTS "${PPCG_SOURCE_DIR}/gitversion.h") + # The source directory is preconfigured + file(READ "${PPCG_SOURCE_DIR}/gitversion.h" GITVERSION_H) + string(REGEX REPLACE ".*\\\"([^\\\"]*)\\\".*" "\\1" PPCG_GIT_HEAD_ID "${GITVERSION_H}") + elseif () + # Unknown revision + # TODO: We could look for a .git and get the revision from HEAD + set(PPCG_GIT_HEAD_ID "UNKNOWN") + endif () + + message(STATUS "PPCG version: ${PPCG_GIT_HEAD_ID}") + + set (PPCG_FILES + ppcg/cuda.c + ppcg/cuda_common.c + ppcg/external.c + ppcg/gpu_array_tile.c + ppcg/gpu.c + ppcg/gpu_array_tile.c + ppcg/gpu_group.c + ppcg/gpu_hybrid.c + ppcg/gpu_print.c + ppcg/gpu_tree.c + ppcg/grouping.c + ppcg/hybrid.c + ppcg/ppcg.c + ppcg/ppcg_options.c + ppcg/print.c + ppcg/schedule.c + ppcg/util.c + ) + + include_directories(BEFORE + ${PPCG_BINARY_DIR} + ${PPCG_SOURCE_DIR}/imath + ${PPCG_SOURCE_DIR}/include + ${PET_SOURCE_DIR}/include + ) + + add_polly_library(PollyPPCG + ${PPCG_FILES} + ) + + target_link_libraries(PollyPPCG PUBLIC 
${ISL_TARGET}) + + # Disable warnings for upstream projects. + if (MSVC) + set(DISABLE_WARNING_FLAGS + -wd4018 # 'expression' : signed/unsigned mismatch + -wd4090 # 'operation' : different 'modifier' qualifiers + -wd4200 # nonstandard extension used: zero-sized array in struct/union + -wd4201 # nonstandard extension used: nameless struct/union + -wd4334 # 'operator': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?) + -wd4221 # nonstandard extension used : 'identifier' : cannot be initialized using address of automatic variable + ) + if (POLLY_BUNDLED_ISL) + target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS}) + target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS}) + endif (POLLY_BUNDLED_ISL) + target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS}) + else () + if (POLLY_BUNDLED_ISL) + set_target_properties(PollyISL polly-isl-test PROPERTIES COMPILE_FLAGS "-w") + endif (POLLY_BUNDLED_ISL) + set_target_properties(PollyPPCG PROPERTIES COMPILE_FLAGS "-w") + endif () + + if(MSVC) + # In the Windows API (with some exceptions), the maximum length for a path is + # MAX_PATH, which is defined as 260 characters. + target_compile_definitions(PollyPPCG PRIVATE "-DPATH_MAX=260") + endif () + + target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS}) +endif () diff --git a/polly/lib/External/ppcg/ChangeLog b/polly/lib/External/ppcg/ChangeLog new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/ChangeLog @@ -0,0 +1,29 @@ +version: 0.07 +date: Tue Feb 7 17:23:22 CET 2017 +changes: + - support hybrid tiling +--- +version: 0.06 +date: Fri May 6 12:08:50 CEST 2016 +changes: + - use PPCG specific macro names in generated code + - complete transition to schedule trees + - maximize coincidence by default + - map arrays with constant index expressions to private memory + - optionally group chains of statements +--- +version: 0.05 +date: Fri Jan 15 09:30:23 CET 2016 +changes: + - fix live-out computation + - optionally compute schedule for C target + - optionally perform tiling for C target + - create single kernel for non-permutable subtree +--- +version: 0.04 +date: Wed Jun 17 10:52:58 CEST 2015 +changes: + - use schedule trees + - fix live-range reordering + - improve generation of synchronization + - exploit independences during dependence analysis diff --git a/polly/lib/External/ppcg/GIT_HEAD_ID b/polly/lib/External/ppcg/GIT_HEAD_ID new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/GIT_HEAD_ID @@ -0,0 +1 @@ +ppcg-0.07 diff --git a/polly/lib/External/ppcg/README b/polly/lib/External/ppcg/README new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/README @@ -0,0 +1,246 @@ +Requirements: + +- automake, autoconf, libtool + (not needed when compiling a release) +- pkg-config (http://www.freedesktop.org/wiki/Software/pkg-config) + (not needed when compiling a release using the included isl and pet) +- gmp (http://gmplib.org/) +- libyaml (http://pyyaml.org/wiki/LibYAML) + (only needed if you want to compile the pet executable) +- LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html) + Unless you have some other reasons for wanting to use the svn version, + it is best to install the latest release (3.9). + For more details, see pet/README. 
+
+If you are installing on Ubuntu, then you can install the following packages:
+
+automake autoconf libtool pkg-config libgmp3-dev libyaml-dev libclang-dev llvm
+
+Note that you need at least version 3.2 of libclang-dev (ubuntu raring).
+Older versions of this package did not include the required libraries.
+If you are using an older version of ubuntu, then you need to compile and
+install LLVM/clang from source.
+
+
+Preparing:
+
+Grab the latest release and extract it or get the source from
+the git repository as follows. This process requires autoconf,
+automake, libtool and pkg-config.
+
+        git clone git://repo.or.cz/ppcg.git
+        cd ppcg
+        ./get_submodules.sh
+        ./autogen.sh
+
+
+Compilation:
+
+        ./configure
+        make
+        make check
+
+If you have installed any of the required libraries in a non-standard
+location, then you may need to use the --with-gmp-prefix,
+--with-libyaml-prefix and/or --with-clang-prefix options
+when calling "./configure".
+
+
+Using PPCG to generate CUDA or OpenCL code
+
+To convert a fragment of a C program to CUDA, insert a line containing
+
+        #pragma scop
+
+before the fragment and add a line containing
+
+        #pragma endscop
+
+after the fragment. To generate CUDA code run
+
+        ppcg --target=cuda file.c
+
+where file.c is the file containing the fragment. The generated
+code is stored in file_host.cu and file_kernel.cu.
+
+To generate OpenCL code run
+
+        ppcg --target=opencl file.c
+
+where file.c is the file containing the fragment. The generated code
+is stored in file_host.c and file_kernel.cl.
+
+
+Specifying tile, grid and block sizes
+
+The iteration space tile size, grid size and block size can
+be specified using the --sizes option. The argument is a union map
+in isl notation mapping kernels identified by their sequence number
+in a "kernel" space to singleton sets in the "tile", "grid" and "block"
+spaces. The sizes are specified outermost to innermost.
+
+The dimension of the "tile" space indicates the (maximal) number of loop
+dimensions to tile. The elements of the single integer tuple
+specify the tile sizes in each dimension.
+In case of hybrid tiling, the first element is half the size of
+the tile in the time (sequential) dimension. The second element
+specifies the number of elements in the base of the hexagon.
+The remaining elements specify the tile sizes in the remaining space
+dimensions.
+
+The dimension of the "grid" space indicates the (maximal) number of block
+dimensions in the grid. The elements of the single integer tuple
+specify the number of blocks in each dimension.
+
+The dimension of the "block" space indicates the (maximal) number of thread
+dimensions in each block. The elements of the single integer tuple
+specify the number of threads in each dimension.
+
+For example,
+
+    { kernel[0] -> tile[64,64]; kernel[i] -> block[16] : i != 4 }
+
+specifies that in kernel 0, two loops should be tiled with a tile
+size of 64 in both dimensions and that all kernels except kernel 4
+should be run using a block of 16 threads.
+
+Since PPCG performs some scheduling, it can be difficult to predict
+what exactly will end up in a kernel. If you want to specify
+tile, grid or block sizes, you may want to run PPCG first with the defaults,
+examine the kernels and then run PPCG again with the desired sizes.
+Instead of examining the kernels, you can also specify the option
+--dump-sizes on the first run to obtain the effectively used default sizes.
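+
+As an illustration only (hypothetical file name and sizes), a complete
+invocation passing such sizes could look like
+
+        ppcg --target=cuda --sizes="{ kernel[i] -> tile[32,32]; kernel[i] -> block[16,16] }" file.c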
+ + +Compiling the generated CUDA code with nvcc + +To get optimal performance from nvcc, it is important to choose --arch +according to your target GPU. Specifically, use the flag "--arch sm_20" +for fermi, "--arch sm_30" for GK10x Kepler and "--arch sm_35" for +GK110 Kepler. We discourage the use of older cards as we have seen +correctness issues with compilation for older architectures. +Note that in the absence of any --arch flag, nvcc defaults to +"--arch sm_13". This will not only be slower, but can also cause +correctness issues. +If you want to obtain results that are identical to those obtained +by the original code, then you may need to disable some optimizations +by passing the "--fmad=false" option. + + +Compiling the generated OpenCL code with gcc + +To compile the host code you need to link against the file +ocl_utilities.c which contains utility functions used by the generated +OpenCL host code. To compile the host code with gcc, run + + gcc -std=c99 file_host.c ocl_utilities.c -lOpenCL + +Note that we have experienced the generated OpenCL code freezing +on some inputs (e.g., the PolyBench symm benchmark) when using +at least some version of the Nvidia OpenCL library, while the +corresponding CUDA code runs fine. +We have experienced no such freezes when using AMD, ARM or Intel +OpenCL libraries. + +By default, the compiled executable will need the _kernel.cl file at +run time. Alternatively, the option --opencl-embed-kernel-code may be +given to place the kernel code in a string literal. The kernel code is +then compiled into the host binary, such that the _kernel.cl file is no +longer needed at run time. Any kernel include files, in particular +those supplied using --opencl-include-file, will still be required at +run time. + + +Function calls + +Function calls inside the analyzed fragment are reproduced +in the CUDA or OpenCL code, but for now it is left to the user +to make sure that the functions that are being called are +available from the generated kernels. + +In the case of OpenCL code, the --opencl-include-file option +may be used to specify one or more files to be #include'd +from the generated code. These files may then contain +the definitions of the functions being called from the +program fragment. If the pathnames of the included files +are relative to the current directory, then you may need +to additionally specify the --opencl-compiler-options=-I. +to make sure that the files can be found by the OpenCL compiler. +The included files may contain definitions of types used by the +generated kernels. By default, PPCG generates definitions for +types as needed, but these definitions may collide with those in +the included files, as PPCG does not consider the contents of the +included files. The --no-opencl-print-kernel-types will prevent +PPCG from generating type definitions. + + +GNU extensions + +By default, PPCG may print out macro definitions that involve +GNU extensions such as __typeof__ and statement expressions. +Some compilers may not support these extensions. +In particular, OpenCL 1.2 beignet 1.1.1 (git-6de6918) +has been reported not to support __typeof__. +The use of these extensions can be turned off with the +--no-allow-gnu-extensions option. + + +Processing PolyBench + +When processing a PolyBench/C 3.2 benchmark, you should always specify +-DPOLYBENCH_USE_C99_PROTO on the ppcg command line. Otherwise, the source +files are inconsistent, having fixed size arrays but parametrically +bounded loops iterating over them. 
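+For example (hypothetical benchmark file):
+
+        ppcg --target=cuda -DPOLYBENCH_USE_C99_PROTO gemm.c
+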
+However, you should not specify this define when compiling +the PPCG generated code using nvcc since CUDA does not support VLAs. + + +CUDA and function overloading + +While CUDA supports function overloading based on the arguments types, +no such function overloading exists in the input language C. Since PPCG +simply prints out the same function name as in the original code, this +may result in a different function being called based on the types +of the arguments. For example, if the original code contains a call +to the function sqrt() with a float argument, then the argument will +be promoted to a double and the sqrt() function will be called. +In the transformed (CUDA) code, however, overloading will cause the +function sqrtf() to be called. Until this issue has been resolved in PPCG, +we recommend that users either explicitly call the function sqrtf() or +explicitly cast the argument to double in the input code. + + +Contact + +For bug reports, feature requests and questions, +contact http://groups.google.com/group/isl-development + +Whenever you report a bug, please mention the exact version of PPCG +that you are using (output of "./ppcg --version"). If you are unable +to compile PPCG, then report the git version (output of "git describe") +or the version number included in the name of the tarball. + + +Citing PPCG + +If you use PPCG for your research, you are invited to cite +the following paper. + +@article{Verdoolaege2013PPCG, + author = {Verdoolaege, Sven and Juega, Juan Carlos and Cohen, Albert and + G\'{o}mez, Jos{\'e} Ignacio and Tenllado, Christian and + Catthoor, Francky}, + title = {Polyhedral parallel code generation for CUDA}, + journal = {ACM Trans. Archit. Code Optim.}, + issue_date = {January 2013}, + volume = {9}, + number = {4}, + month = jan, + year = {2013}, + issn = {1544-3566}, + pages = {54:1--54:23}, + doi = {10.1145/2400682.2400713}, + acmid = {2400713}, + publisher = {ACM}, + address = {New York, NY, USA}, +} diff --git a/polly/lib/External/ppcg/cpu.h b/polly/lib/External/ppcg/cpu.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/cpu.h @@ -0,0 +1,15 @@ +#ifndef _CPU_H +#define _CPU_H + +#include + +#include "ppcg.h" + +struct ppcg_options; + +__isl_give isl_printer *print_cpu(__isl_take isl_printer *p, + struct ppcg_scop *ps, struct ppcg_options *options); +int generate_cpu(isl_ctx *ctx, struct ppcg_options *options, + const char *input, const char *output); + +#endif diff --git a/polly/lib/External/ppcg/cpu.c b/polly/lib/External/ppcg/cpu.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/cpu.c @@ -0,0 +1,802 @@ +/* + * Copyright 2012 INRIA Paris-Rocquencourt + * Copyright 2012 Ecole Normale Superieure + * + * Use of this software is governed by the MIT license + * + * Written by Tobias Grosser, INRIA Paris-Rocquencourt, + * Domaine de Voluceau, Rocquenqourt, B.P. 105, + * 78153 Le Chesnay Cedex France + * and Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ppcg.h" +#include "ppcg_options.h" +#include "cpu.h" +#include "print.h" +#include "schedule.h" +#include "util.h" + +/* Representation of a statement inside a generated AST. + * + * "stmt" refers to the original statement. + * "ref2expr" maps the reference identifier of each access in + * the statement to an AST expression that should be printed + * at the place of the access. 
+ */ +struct ppcg_stmt { + struct pet_stmt *stmt; + + isl_id_to_ast_expr *ref2expr; +}; + +static void ppcg_stmt_free(void *user) +{ + struct ppcg_stmt *stmt = user; + + if (!stmt) + return; + + isl_id_to_ast_expr_free(stmt->ref2expr); + + free(stmt); +} + +/* Derive the output file name from the input file name. + * 'input' is the entire path of the input file. The output + * is the file name plus the additional extension. + * + * We will basically replace everything after the last point + * with '.ppcg.c'. This means file.c becomes file.ppcg.c + */ +static FILE *get_output_file(const char *input, const char *output) +{ + char name[PATH_MAX]; + const char *ext; + const char ppcg_marker[] = ".ppcg"; + int len; + FILE *file; + + len = ppcg_extract_base_name(name, input); + + strcpy(name + len, ppcg_marker); + ext = strrchr(input, '.'); + strcpy(name + len + sizeof(ppcg_marker) - 1, ext ? ext : ".c"); + + if (!output) + output = name; + + file = fopen(output, "w"); + if (!file) { + fprintf(stderr, "Unable to open '%s' for writing\n", output); + return NULL; + } + + return file; +} + +/* Data used to annotate for nodes in the ast. + */ +struct ast_node_userinfo { + /* The for node is an openmp parallel for node. */ + int is_openmp; +}; + +/* Information used while building the ast. + */ +struct ast_build_userinfo { + /* The current ppcg scop. */ + struct ppcg_scop *scop; + + /* Are we currently in a parallel for loop? */ + int in_parallel_for; +}; + +/* Check if the current scheduling dimension is parallel. + * + * We check for parallelism by verifying that the loop does not carry any + * dependences. + * If the live_range_reordering option is set, then this currently + * includes the order dependences. In principle, non-zero order dependences + * could be allowed, but this would require privatization and/or expansion. + * + * Parallelism test: if the distance is zero in all outer dimensions, then it + * has to be zero in the current dimension as well. + * Implementation: first, translate dependences into time space, then force + * outer dimensions to be equal. If the distance is zero in the current + * dimension, then the loop is parallel. + * The distance is zero in the current dimension if it is a subset of a map + * with equal values for the current dimension. 
+ */ +static int ast_schedule_dim_is_parallel(__isl_keep isl_ast_build *build, + struct ppcg_scop *scop) +{ + isl_union_map *schedule, *deps; + isl_map *schedule_deps, *test; + isl_space *schedule_space; + unsigned i, dimension, is_parallel; + + schedule = isl_ast_build_get_schedule(build); + schedule_space = isl_ast_build_get_schedule_space(build); + + dimension = isl_space_dim(schedule_space, isl_dim_out) - 1; + + deps = isl_union_map_copy(scop->dep_flow); + deps = isl_union_map_union(deps, isl_union_map_copy(scop->dep_false)); + if (scop->options->live_range_reordering) { + isl_union_map *order = isl_union_map_copy(scop->dep_order); + deps = isl_union_map_union(deps, order); + } + deps = isl_union_map_apply_range(deps, isl_union_map_copy(schedule)); + deps = isl_union_map_apply_domain(deps, schedule); + + if (isl_union_map_is_empty(deps)) { + isl_union_map_free(deps); + isl_space_free(schedule_space); + return 1; + } + + schedule_deps = isl_map_from_union_map(deps); + + for (i = 0; i < dimension; i++) + schedule_deps = isl_map_equate(schedule_deps, isl_dim_out, i, + isl_dim_in, i); + + test = isl_map_universe(isl_map_get_space(schedule_deps)); + test = isl_map_equate(test, isl_dim_out, dimension, isl_dim_in, + dimension); + is_parallel = isl_map_is_subset(schedule_deps, test); + + isl_space_free(schedule_space); + isl_map_free(test); + isl_map_free(schedule_deps); + + return is_parallel; +} + +/* Mark a for node openmp parallel, if it is the outermost parallel for node. + */ +static void mark_openmp_parallel(__isl_keep isl_ast_build *build, + struct ast_build_userinfo *build_info, + struct ast_node_userinfo *node_info) +{ + if (build_info->in_parallel_for) + return; + + if (ast_schedule_dim_is_parallel(build, build_info->scop)) { + build_info->in_parallel_for = 1; + node_info->is_openmp = 1; + } +} + +/* Allocate an ast_node_info structure and initialize it with default values. + */ +static struct ast_node_userinfo *allocate_ast_node_userinfo() +{ + struct ast_node_userinfo *node_info; + node_info = (struct ast_node_userinfo *) + malloc(sizeof(struct ast_node_userinfo)); + node_info->is_openmp = 0; + return node_info; +} + +/* Free an ast_node_info structure. + */ +static void free_ast_node_userinfo(void *ptr) +{ + struct ast_node_userinfo *info; + info = (struct ast_node_userinfo *) ptr; + free(info); +} + +/* This method is executed before the construction of a for node. It creates + * an isl_id that is used to annotate the subsequently generated ast for nodes. + * + * In this function we also run the following analyses: + * + * - Detection of openmp parallel loops + */ +static __isl_give isl_id *ast_build_before_for( + __isl_keep isl_ast_build *build, void *user) +{ + isl_id *id; + struct ast_build_userinfo *build_info; + struct ast_node_userinfo *node_info; + + build_info = (struct ast_build_userinfo *) user; + node_info = allocate_ast_node_userinfo(); + id = isl_id_alloc(isl_ast_build_get_ctx(build), "", node_info); + id = isl_id_set_free_user(id, free_ast_node_userinfo); + + mark_openmp_parallel(build, build_info, node_info); + + return id; +} + +/* This method is executed after the construction of a for node. + * + * It performs the following actions: + * + * - Reset the 'in_parallel_for' flag, as soon as we leave a for node, + * that is marked as openmp parallel. 
+ * + */ +static __isl_give isl_ast_node *ast_build_after_for( + __isl_take isl_ast_node *node, __isl_keep isl_ast_build *build, + void *user) +{ + isl_id *id; + struct ast_build_userinfo *build_info; + struct ast_node_userinfo *info; + + id = isl_ast_node_get_annotation(node); + info = isl_id_get_user(id); + + if (info && info->is_openmp) { + build_info = (struct ast_build_userinfo *) user; + build_info->in_parallel_for = 0; + } + + isl_id_free(id); + + return node; +} + +/* Find the element in scop->stmts that has the given "id". + */ +static struct pet_stmt *find_stmt(struct ppcg_scop *scop, __isl_keep isl_id *id) +{ + int i; + + for (i = 0; i < scop->pet->n_stmt; ++i) { + struct pet_stmt *stmt = scop->pet->stmts[i]; + isl_id *id_i; + + id_i = isl_set_get_tuple_id(stmt->domain); + isl_id_free(id_i); + + if (id_i == id) + return stmt; + } + + isl_die(isl_id_get_ctx(id), isl_error_internal, + "statement not found", return NULL); +} + +/* Print a user statement in the generated AST. + * The ppcg_stmt has been attached to the node in at_each_domain. + */ +static __isl_give isl_printer *print_user(__isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options, + __isl_keep isl_ast_node *node, void *user) +{ + struct ppcg_stmt *stmt; + isl_id *id; + + id = isl_ast_node_get_annotation(node); + stmt = isl_id_get_user(id); + isl_id_free(id); + + p = pet_stmt_print_body(stmt->stmt, p, stmt->ref2expr); + + isl_ast_print_options_free(print_options); + + return p; +} + + +/* Print a for loop node as an openmp parallel loop. + * + * To print an openmp parallel loop we print a normal for loop, but add + * "#pragma openmp parallel for" in front. + * + * Variables that are declared within the body of this for loop are + * automatically openmp 'private'. Iterators declared outside of the + * for loop are automatically openmp 'shared'. As ppcg declares all iterators + * at the position where they are assigned, there is no need to explicitly mark + * variables. Their automatically assigned type is already correct. + * + * This function only generates valid OpenMP code, if the ast was generated + * with the 'atomic-bounds' option enabled. + * + */ +static __isl_give isl_printer *print_for_with_openmp( + __isl_keep isl_ast_node *node, __isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "#pragma omp parallel for"); + p = isl_printer_end_line(p); + + p = isl_ast_node_for_print(node, p, print_options); + + return p; +} + +/* Print a for node. + * + * Depending on how the node is annotated, we either print a normal + * for node or an openmp parallel for node. + */ +static __isl_give isl_printer *print_for(__isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options, + __isl_keep isl_ast_node *node, void *user) +{ + isl_id *id; + int openmp; + + openmp = 0; + id = isl_ast_node_get_annotation(node); + + if (id) { + struct ast_node_userinfo *info; + + info = (struct ast_node_userinfo *) isl_id_get_user(id); + if (info && info->is_openmp) + openmp = 1; + } + + if (openmp) + p = print_for_with_openmp(node, p, print_options); + else + p = isl_ast_node_for_print(node, p, print_options); + + isl_id_free(id); + + return p; +} + +/* Index transformation callback for pet_stmt_build_ast_exprs. + * + * "index" expresses the array indices in terms of statement iterators + * "iterator_map" expresses the statement iterators in terms of + * AST loop iterators. 
+ * + * The result expresses the array indices in terms of + * AST loop iterators. + */ +static __isl_give isl_multi_pw_aff *pullback_index( + __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *id, void *user) +{ + isl_pw_multi_aff *iterator_map = user; + + iterator_map = isl_pw_multi_aff_copy(iterator_map); + return isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map); +} + +/* Transform the accesses in the statement associated to the domain + * called by "node" to refer to the AST loop iterators, construct + * corresponding AST expressions using "build", + * collect them in a ppcg_stmt and annotate the node with the ppcg_stmt. + */ +static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node, + __isl_keep isl_ast_build *build, void *user) +{ + struct ppcg_scop *scop = user; + isl_ast_expr *expr, *arg; + isl_ctx *ctx; + isl_id *id; + isl_map *map; + isl_pw_multi_aff *iterator_map; + struct ppcg_stmt *stmt; + + ctx = isl_ast_node_get_ctx(node); + stmt = isl_calloc_type(ctx, struct ppcg_stmt); + if (!stmt) + goto error; + + expr = isl_ast_node_user_get_expr(node); + arg = isl_ast_expr_get_op_arg(expr, 0); + isl_ast_expr_free(expr); + id = isl_ast_expr_get_id(arg); + isl_ast_expr_free(arg); + stmt->stmt = find_stmt(scop, id); + isl_id_free(id); + if (!stmt->stmt) + goto error; + + map = isl_map_from_union_map(isl_ast_build_get_schedule(build)); + map = isl_map_reverse(map); + iterator_map = isl_pw_multi_aff_from_map(map); + stmt->ref2expr = pet_stmt_build_ast_exprs(stmt->stmt, build, + &pullback_index, iterator_map, NULL, NULL); + isl_pw_multi_aff_free(iterator_map); + + id = isl_id_alloc(isl_ast_node_get_ctx(node), NULL, stmt); + id = isl_id_set_free_user(id, &ppcg_stmt_free); + return isl_ast_node_set_annotation(node, id); +error: + ppcg_stmt_free(stmt); + return isl_ast_node_free(node); +} + +/* Set *depth (initialized to 0 by the caller) to the maximum + * of the schedule depths of the leaf nodes for which this function is called. + */ +static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user) +{ + int *depth = user; + int node_depth; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf) + return isl_bool_true; + node_depth = isl_schedule_node_get_schedule_depth(node); + if (node_depth > *depth) + *depth = node_depth; + + return isl_bool_false; +} + +/* This function is called for each node in a CPU AST. + * In case of a user node, print the macro definitions required + * for printing the AST expressions in the annotation, if any. + * For other nodes, return true such that descendants are also + * visited. + * + * In particular, print the macro definitions needed for the substitutions + * of the original user statements. + */ +static isl_bool at_node(__isl_keep isl_ast_node *node, void *user) +{ + struct ppcg_stmt *stmt; + isl_id *id; + isl_printer **p = user; + + if (isl_ast_node_get_type(node) != isl_ast_node_user) + return isl_bool_true; + + id = isl_ast_node_get_annotation(node); + stmt = isl_id_get_user(id); + isl_id_free(id); + + if (!stmt) + return isl_bool_error; + + *p = ppcg_print_body_macros(*p, stmt->ref2expr); + if (!*p) + return isl_bool_error; + + return isl_bool_false; +} + +/* Print the required macros for the CPU AST "node" to "p", + * including those needed for the user statements inside the AST. 
+ */ +static __isl_give isl_printer *cpu_print_macros(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node) +{ + if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0) + return isl_printer_free(p); + p = ppcg_print_macros(p, node); + return p; +} + +/* Code generate the scop 'scop' using "schedule" + * and print the corresponding C code to 'p'. + */ +static __isl_give isl_printer *print_scop(struct ppcg_scop *scop, + __isl_take isl_schedule *schedule, __isl_take isl_printer *p, + struct ppcg_options *options) +{ + isl_ctx *ctx = isl_printer_get_ctx(p); + isl_ast_build *build; + isl_ast_print_options *print_options; + isl_ast_node *tree; + isl_id_list *iterators; + struct ast_build_userinfo build_info; + int depth; + + depth = 0; + if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth, + &depth) < 0) + goto error; + + build = isl_ast_build_alloc(ctx); + iterators = ppcg_scop_generate_names(scop, depth, "c"); + build = isl_ast_build_set_iterators(build, iterators); + build = isl_ast_build_set_at_each_domain(build, &at_each_domain, scop); + + if (options->openmp) { + build_info.scop = scop; + build_info.in_parallel_for = 0; + + build = isl_ast_build_set_before_each_for(build, + &ast_build_before_for, + &build_info); + build = isl_ast_build_set_after_each_for(build, + &ast_build_after_for, + &build_info); + } + + tree = isl_ast_build_node_from_schedule(build, schedule); + isl_ast_build_free(build); + + print_options = isl_ast_print_options_alloc(ctx); + print_options = isl_ast_print_options_set_print_user(print_options, + &print_user, NULL); + + print_options = isl_ast_print_options_set_print_for(print_options, + &print_for, NULL); + + p = cpu_print_macros(p, tree); + p = isl_ast_node_print(tree, p, print_options); + + isl_ast_node_free(tree); + + return p; +error: + isl_schedule_free(schedule); + isl_printer_free(p); + return NULL; +} + +/* Tile the band node "node" with tile sizes "sizes" and + * mark all members of the resulting tile node as "atomic". + */ +static __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node, + __isl_take isl_multi_val *sizes) +{ + node = isl_schedule_node_band_tile(node, sizes); + node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); + + return node; +} + +/* Tile "node", if it is a band node with at least 2 members. + * The tile sizes are set from the "tile_size" option. + */ +static __isl_give isl_schedule_node *tile_band( + __isl_take isl_schedule_node *node, void *user) +{ + struct ppcg_scop *scop = user; + int n; + isl_space *space; + isl_multi_val *sizes; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_band) + return node; + + n = isl_schedule_node_band_n_member(node); + if (n <= 1) + return node; + + space = isl_schedule_node_band_get_space(node); + sizes = ppcg_multi_val_from_int(space, scop->options->tile_size); + + return tile(node, sizes); +} + +/* Construct schedule constraints from the dependences in ps + * for the purpose of computing a schedule for a CPU. + * + * The proximity constraints are set to the flow dependences. + * + * If live-range reordering is allowed then the conditional validity + * constraints are set to the order dependences with the flow dependences + * as condition. That is, a live-range (flow dependence) will be either + * local to an iteration of a band or all adjacent order dependences + * will be respected by the band. 
+ * The validity constraints are set to the union of the flow dependences + * and the forced dependences, while the coincidence constraints + * are set to the union of the flow dependences, the forced dependences and + * the order dependences. + * + * If live-range reordering is not allowed, then both the validity + * and the coincidence constraints are set to the union of the flow + * dependences and the false dependences. + * + * Note that the coincidence constraints are only set when the "openmp" + * options is set. Even though the way openmp pragmas are introduced + * does not rely on the coincident property of the schedule band members, + * the coincidence constraints do affect the way the schedule is constructed, + * such that more schedule dimensions should be detected as parallel + * by ast_schedule_dim_is_parallel. + * Since the order dependences are also taken into account by + * ast_schedule_dim_is_parallel, they are also added to + * the coincidence constraints. If the openmp handling learns + * how to privatize some memory, then the corresponding order + * dependences can be removed from the coincidence constraints. + */ +static __isl_give isl_schedule_constraints *construct_cpu_schedule_constraints( + struct ppcg_scop *ps) +{ + isl_schedule_constraints *sc; + isl_union_map *validity, *coincidence; + + sc = isl_schedule_constraints_on_domain(isl_union_set_copy(ps->domain)); + if (ps->options->live_range_reordering) { + sc = isl_schedule_constraints_set_conditional_validity(sc, + isl_union_map_copy(ps->tagged_dep_flow), + isl_union_map_copy(ps->tagged_dep_order)); + validity = isl_union_map_copy(ps->dep_flow); + validity = isl_union_map_union(validity, + isl_union_map_copy(ps->dep_forced)); + if (ps->options->openmp) { + coincidence = isl_union_map_copy(validity); + coincidence = isl_union_map_union(coincidence, + isl_union_map_copy(ps->dep_order)); + } + } else { + validity = isl_union_map_copy(ps->dep_flow); + validity = isl_union_map_union(validity, + isl_union_map_copy(ps->dep_false)); + if (ps->options->openmp) + coincidence = isl_union_map_copy(validity); + } + if (ps->options->openmp) + sc = isl_schedule_constraints_set_coincidence(sc, coincidence); + sc = isl_schedule_constraints_set_validity(sc, validity); + sc = isl_schedule_constraints_set_proximity(sc, + isl_union_map_copy(ps->dep_flow)); + + return sc; +} + +/* Compute a schedule for the scop "ps". + * + * First derive the appropriate schedule constraints from the dependences + * in "ps" and then compute a schedule from those schedule constraints, + * possibly grouping statement instances based on the input schedule. + */ +static __isl_give isl_schedule *compute_cpu_schedule(struct ppcg_scop *ps) +{ + isl_schedule_constraints *sc; + isl_schedule *schedule; + + if (!ps) + return NULL; + + sc = construct_cpu_schedule_constraints(ps); + + if (ps->options->debug->dump_schedule_constraints) + isl_schedule_constraints_dump(sc); + schedule = ppcg_compute_schedule(sc, ps->schedule, ps->options); + + return schedule; +} + +/* Compute a new schedule to the scop "ps" if the reschedule option is set. + * Otherwise, return a copy of the original schedule. + */ +static __isl_give isl_schedule *optionally_compute_schedule(void *user) +{ + struct ppcg_scop *ps = user; + + if (!ps) + return NULL; + if (!ps->options->reschedule) + return isl_schedule_copy(ps->schedule); + return compute_cpu_schedule(ps); +} + +/* Compute a schedule based on the dependences in "ps" and + * tile it if requested by the user. 
+ */ +static __isl_give isl_schedule *get_schedule(struct ppcg_scop *ps, + struct ppcg_options *options) +{ + isl_ctx *ctx; + isl_schedule *schedule; + + if (!ps) + return NULL; + + ctx = isl_union_set_get_ctx(ps->domain); + schedule = ppcg_get_schedule(ctx, options, + &optionally_compute_schedule, ps); + if (ps->options->tile) + schedule = isl_schedule_map_schedule_node_bottom_up(schedule, + &tile_band, ps); + + return schedule; +} + +/* Generate CPU code for the scop "ps" using "schedule" and + * print the corresponding C code to "p", including variable declarations. + */ +static __isl_give isl_printer *print_cpu_with_schedule( + __isl_take isl_printer *p, struct ppcg_scop *ps, + __isl_take isl_schedule *schedule, struct ppcg_options *options) +{ + int hidden; + isl_set *context; + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "/* ppcg generated CPU code */"); + p = isl_printer_end_line(p); + + p = isl_printer_start_line(p); + p = isl_printer_end_line(p); + + p = ppcg_set_macro_names(p); + p = ppcg_print_exposed_declarations(p, ps); + hidden = ppcg_scop_any_hidden_declarations(ps); + if (hidden) { + p = ppcg_start_block(p); + p = ppcg_print_hidden_declarations(p, ps); + } + + context = isl_set_copy(ps->context); + context = isl_set_from_params(context); + schedule = isl_schedule_insert_context(schedule, context); + if (options->debug->dump_final_schedule) + isl_schedule_dump(schedule); + p = print_scop(ps, schedule, p, options); + if (hidden) + p = ppcg_end_block(p); + + return p; +} + +/* Generate CPU code for the scop "ps" and print the corresponding C code + * to "p", including variable declarations. + */ +__isl_give isl_printer *print_cpu(__isl_take isl_printer *p, + struct ppcg_scop *ps, struct ppcg_options *options) +{ + isl_schedule *schedule; + + schedule = isl_schedule_copy(ps->schedule); + return print_cpu_with_schedule(p, ps, schedule, options); +} + +/* Generate CPU code for "scop" and print it to "p". + * + * First obtain a schedule for "scop" and then print code for "scop" + * using that schedule. + */ +static __isl_give isl_printer *generate(__isl_take isl_printer *p, + struct ppcg_scop *scop, struct ppcg_options *options) +{ + isl_schedule *schedule; + + schedule = get_schedule(scop, options); + + return print_cpu_with_schedule(p, scop, schedule, options); +} + +/* Wrapper around generate for use as a ppcg_transform callback. + */ +static __isl_give isl_printer *print_cpu_wrap(__isl_take isl_printer *p, + struct ppcg_scop *scop, void *user) +{ + struct ppcg_options *options = user; + + return generate(p, scop, options); +} + +/* Transform the code in the file called "input" by replacing + * all scops by corresponding CPU code and write the results to a file + * called "output". 
+ */ +int generate_cpu(isl_ctx *ctx, struct ppcg_options *options, + const char *input, const char *output) +{ + FILE *output_file; + int r; + + output_file = get_output_file(input, output); + if (!output_file) + return -1; + + r = ppcg_transform(ctx, input, output_file, options, + &print_cpu_wrap, options); + + fclose(output_file); + + return r; +} diff --git a/polly/lib/External/ppcg/cuda.h b/polly/lib/External/ppcg/cuda.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/cuda.h @@ -0,0 +1,13 @@ +#ifndef _CUDA_H +#define _CUDA_H + +#include "ppcg_options.h" +#include "ppcg.h" + +int generate_cuda(isl_ctx *ctx, struct ppcg_options *options, + const char *input); + +__isl_give isl_printer *print_host_user(__isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options, + __isl_keep isl_ast_node *node, void *user); +#endif diff --git a/polly/lib/External/ppcg/cuda.c b/polly/lib/External/ppcg/cuda.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/cuda.c @@ -0,0 +1,730 @@ +/* + * Copyright 2012 Ecole Normale Superieure + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France + */ + +#include +#include + +#include "cuda_common.h" +#include "cuda.h" +#include "gpu.h" +#include "gpu_print.h" +#include "print.h" +#include "util.h" + +static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p) +{ + const char *macros = + "#define cudaCheckReturn(ret) \\\n" + " do { \\\n" + " cudaError_t cudaCheckReturn_e = (ret); \\\n" + " if (cudaCheckReturn_e != cudaSuccess) { \\\n" + " fprintf(stderr, \"CUDA error: %s\\n\", " + "cudaGetErrorString(cudaCheckReturn_e)); \\\n" + " fflush(stderr); \\\n" + " } \\\n" + " assert(cudaCheckReturn_e == cudaSuccess); \\\n" + " } while(0)\n" + "#define cudaCheckKernel() \\\n" + " do { \\\n" + " cudaCheckReturn(cudaGetLastError()); \\\n" + " } while(0)\n\n"; + + p = isl_printer_print_str(p, macros); + return p; +} + +/* Print a declaration for the device array corresponding to "array" on "p". 
+ */ +static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p, + struct gpu_array_info *array) +{ + int i; + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, array->type); + p = isl_printer_print_str(p, " "); + if (!array->linearize && array->n_index > 1) + p = isl_printer_print_str(p, "("); + p = isl_printer_print_str(p, "*dev_"); + p = isl_printer_print_str(p, array->name); + if (!array->linearize && array->n_index > 1) { + p = isl_printer_print_str(p, ")"); + for (i = 1; i < array->n_index; i++) { + isl_ast_expr *bound; + bound = isl_ast_expr_get_op_arg(array->bound_expr, + 1 + i); + p = isl_printer_print_str(p, "["); + p = isl_printer_print_ast_expr(p, bound); + p = isl_printer_print_str(p, "]"); + isl_ast_expr_free(bound); + } + } + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + return p; +} + +static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + if (!gpu_array_requires_device_allocation(&prog->array[i])) + continue; + + p = declare_device_array(p, &prog->array[i]); + } + p = isl_printer_start_line(p); + p = isl_printer_end_line(p); + return p; +} + +static __isl_give isl_printer *allocate_device_arrays( + __isl_take isl_printer *p, struct gpu_prog *prog) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + struct gpu_array_info *array = &prog->array[i]; + + if (!gpu_array_requires_device_allocation(&prog->array[i])) + continue; + p = ppcg_ast_expr_print_macros(array->bound_expr, p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, + "cudaCheckReturn(cudaMalloc((void **) &dev_"); + p = isl_printer_print_str(p, prog->array[i].name); + p = isl_printer_print_str(p, ", "); + p = gpu_array_info_print_size(p, &prog->array[i]); + p = isl_printer_print_str(p, "));"); + p = isl_printer_end_line(p); + } + p = isl_printer_start_line(p); + p = isl_printer_end_line(p); + return p; +} + +static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + if (!gpu_array_requires_device_allocation(&prog->array[i])) + continue; + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_"); + p = isl_printer_print_str(p, prog->array[i].name); + p = isl_printer_print_str(p, "));"); + p = isl_printer_end_line(p); + } + + return p; +} + +/* Print code to "p" for copying "array" from the host to the device + * in its entirety. The bounds on the extent of "array" have + * been precomputed in extract_array_info and are used in + * gpu_array_info_print_size. + */ +static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p, + struct gpu_array_info *array) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_"); + p = isl_printer_print_str(p, array->name); + p = isl_printer_print_str(p, ", "); + + if (gpu_array_is_scalar(array)) + p = isl_printer_print_str(p, "&"); + p = isl_printer_print_str(p, array->name); + p = isl_printer_print_str(p, ", "); + + p = gpu_array_info_print_size(p, array); + p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));"); + p = isl_printer_end_line(p); + + return p; +} + +/* Print code to "p" for copying "array" back from the device to the host + * in its entirety. The bounds on the extent of "array" have + * been precomputed in extract_array_info and are used in + * gpu_array_info_print_size. 
+ */ +static __isl_give isl_printer *copy_array_from_device( + __isl_take isl_printer *p, struct gpu_array_info *array) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy("); + if (gpu_array_is_scalar(array)) + p = isl_printer_print_str(p, "&"); + p = isl_printer_print_str(p, array->name); + p = isl_printer_print_str(p, ", dev_"); + p = isl_printer_print_str(p, array->name); + p = isl_printer_print_str(p, ", "); + p = gpu_array_info_print_size(p, array); + p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));"); + p = isl_printer_end_line(p); + + return p; +} + +static __isl_give isl_printer* print_reverse_list(__isl_take isl_printer *p, int len, int *list) +{ + int i; + + if (len == 0) + return p; + + p = isl_printer_print_str(p, "("); + for (i = 0; i < len; ++i) { + if (i) + p = isl_printer_print_str(p, ", "); + p = isl_printer_print_int(p, list[len - 1 - i]); + } + return isl_printer_print_str(p, ")"); +} + +/* Print the effective grid size as a list of the sizes in each + * dimension, from innermost to outermost. + */ +static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p, + struct ppcg_kernel *kernel) +{ + int i; + int dim; + + dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set); + if (dim == 0) + return p; + + p = isl_printer_print_str(p, "("); + for (i = dim - 1; i >= 0; --i) { + isl_ast_expr *bound; + + bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i); + p = isl_printer_print_ast_expr(p, bound); + isl_ast_expr_free(bound); + + if (i > 0) + p = isl_printer_print_str(p, ", "); + } + + p = isl_printer_print_str(p, ")"); + + return p; +} + +/* Print the grid definition. + */ +static __isl_give isl_printer *print_grid(__isl_take isl_printer *p, + struct ppcg_kernel *kernel) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "dim3 k"); + p = isl_printer_print_int(p, kernel->id); + p = isl_printer_print_str(p, "_dimGrid"); + p = print_grid_size(p, kernel); + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + return p; +} + +/* Print the arguments to a kernel declaration or call. If "types" is set, + * then print a declaration (including the types of the arguments). 
+ * + * The arguments are printed in the following order + * - the arrays accessed by the kernel + * - the parameters + * - the host loop iterators + */ +static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p, + struct gpu_prog *prog, struct ppcg_kernel *kernel, int types) +{ + int i, n; + int first = 1; + unsigned nparam; + isl_space *space; + const char *type; + + for (i = 0; i < prog->n_array; ++i) { + int required; + + required = ppcg_kernel_requires_array_argument(kernel, i); + if (required < 0) + return isl_printer_free(p); + if (!required) + continue; + + if (!first) + p = isl_printer_print_str(p, ", "); + + if (types) + p = gpu_array_info_print_declaration_argument(p, + &prog->array[i], NULL); + else + p = gpu_array_info_print_call_argument(p, + &prog->array[i]); + + first = 0; + } + + space = isl_union_set_get_space(kernel->arrays); + nparam = isl_space_dim(space, isl_dim_param); + for (i = 0; i < nparam; ++i) { + const char *name; + + name = isl_space_get_dim_name(space, isl_dim_param, i); + + if (!first) + p = isl_printer_print_str(p, ", "); + if (types) + p = isl_printer_print_str(p, "int "); + p = isl_printer_print_str(p, name); + + first = 0; + } + isl_space_free(space); + + n = isl_space_dim(kernel->space, isl_dim_set); + type = isl_options_get_ast_iterator_type(prog->ctx); + for (i = 0; i < n; ++i) { + const char *name; + + if (!first) + p = isl_printer_print_str(p, ", "); + name = isl_space_get_dim_name(kernel->space, isl_dim_set, i); + if (types) { + p = isl_printer_print_str(p, type); + p = isl_printer_print_str(p, " "); + } + p = isl_printer_print_str(p, name); + + first = 0; + } + + return p; +} + +/* Print the header of the given kernel. + */ +static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p, + struct gpu_prog *prog, struct ppcg_kernel *kernel) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "__global__ void kernel"); + p = isl_printer_print_int(p, kernel->id); + p = isl_printer_print_str(p, "("); + p = print_kernel_arguments(p, prog, kernel, 1); + p = isl_printer_print_str(p, ")"); + + return p; +} + +/* Print the header of the given kernel to both gen->cuda.kernel_h + * and gen->cuda.kernel_c. + */ +static void print_kernel_headers(struct gpu_prog *prog, + struct ppcg_kernel *kernel, struct cuda_info *cuda) +{ + isl_printer *p; + + p = isl_printer_to_file(prog->ctx, cuda->kernel_h); + p = isl_printer_set_output_format(p, ISL_FORMAT_C); + p = print_kernel_header(p, prog, kernel); + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + isl_printer_free(p); + + p = isl_printer_to_file(prog->ctx, cuda->kernel_c); + p = isl_printer_set_output_format(p, ISL_FORMAT_C); + p = print_kernel_header(p, prog, kernel); + p = isl_printer_end_line(p); + isl_printer_free(p); +} + +static void print_indent(FILE *dst, int indent) +{ + fprintf(dst, "%*s", indent, ""); +} + +/* Print a list of iterators of type "type" with names "ids" to "out". + * Each iterator is assigned one of the cuda identifiers in cuda_dims. + * In particular, the last iterator is assigned the x identifier + * (the first in the list of cuda identifiers). 
+ */ +static void print_iterators(FILE *out, const char *type, + __isl_keep isl_id_list *ids, const char *cuda_dims[]) +{ + int i, n; + + n = isl_id_list_n_id(ids); + if (n <= 0) + return; + print_indent(out, 4); + fprintf(out, "%s ", type); + for (i = 0; i < n; ++i) { + isl_id *id; + + if (i) + fprintf(out, ", "); + id = isl_id_list_get_id(ids, i); + fprintf(out, "%s = %s", isl_id_get_name(id), + cuda_dims[n - 1 - i]); + isl_id_free(id); + } + fprintf(out, ";\n"); +} + +static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel) +{ + isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree); + const char *type; + const char *block_dims[] = { "blockIdx.x", "blockIdx.y" }; + const char *thread_dims[] = { "threadIdx.x", "threadIdx.y", + "threadIdx.z" }; + + type = isl_options_get_ast_iterator_type(ctx); + + print_iterators(out, type, kernel->block_ids, block_dims); + print_iterators(out, type, kernel->thread_ids, thread_dims); +} + +static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p, + struct ppcg_kernel_var *var) +{ + int j; + + p = isl_printer_start_line(p); + if (var->type == ppcg_access_shared) + p = isl_printer_print_str(p, "__shared__ "); + p = isl_printer_print_str(p, var->array->type); + p = isl_printer_print_str(p, " "); + p = isl_printer_print_str(p, var->name); + for (j = 0; j < var->array->n_index; ++j) { + isl_val *v; + + p = isl_printer_print_str(p, "["); + v = isl_vec_get_element_val(var->size, j); + p = isl_printer_print_val(p, v); + isl_val_free(v); + p = isl_printer_print_str(p, "]"); + } + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + return p; +} + +static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p, + struct ppcg_kernel *kernel) +{ + int i; + + for (i = 0; i < kernel->n_var; ++i) + p = print_kernel_var(p, &kernel->var[i]); + + return p; +} + +/* Print a sync statement. + */ +static __isl_give isl_printer *print_sync(__isl_take isl_printer *p, + struct ppcg_kernel_stmt *stmt) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "__syncthreads();"); + p = isl_printer_end_line(p); + + return p; +} + +/* This function is called for each user statement in the AST, + * i.e., for each kernel body statement, copy statement or sync statement. 
+ */ +static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options, + __isl_keep isl_ast_node *node, void *user) +{ + isl_id *id; + struct ppcg_kernel_stmt *stmt; + + id = isl_ast_node_get_annotation(node); + stmt = isl_id_get_user(id); + isl_id_free(id); + + isl_ast_print_options_free(print_options); + + switch (stmt->type) { + case ppcg_kernel_copy: + return ppcg_kernel_print_copy(p, stmt); + case ppcg_kernel_sync: + return print_sync(p, stmt); + case ppcg_kernel_domain: + return ppcg_kernel_print_domain(p, stmt); + } + + return p; +} + +static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel, + struct cuda_info *cuda) +{ + isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree); + isl_ast_print_options *print_options; + isl_printer *p; + + print_kernel_headers(prog, kernel, cuda); + fprintf(cuda->kernel_c, "{\n"); + print_kernel_iterators(cuda->kernel_c, kernel); + + p = isl_printer_to_file(ctx, cuda->kernel_c); + p = isl_printer_set_output_format(p, ISL_FORMAT_C); + p = isl_printer_indent(p, 4); + + p = print_kernel_vars(p, kernel); + p = isl_printer_end_line(p); + p = ppcg_set_macro_names(p); + p = gpu_print_macros(p, kernel->tree); + + print_options = isl_ast_print_options_alloc(ctx); + print_options = isl_ast_print_options_set_print_user(print_options, + &print_kernel_stmt, NULL); + p = isl_ast_node_print(kernel->tree, p, print_options); + isl_printer_free(p); + + fprintf(cuda->kernel_c, "}\n"); +} + +/* Print code for initializing the device for execution of the transformed + * code. This includes declaring locally defined variables as well as + * declaring and allocating the required copies of arrays on the device. + */ +static __isl_give isl_printer *init_device(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + p = print_cuda_macros(p); + + p = gpu_print_local_declarations(p, prog); + p = declare_device_arrays(p, prog); + p = allocate_device_arrays(p, prog); + + return p; +} + +/* Print code for clearing the device after execution of the transformed code. + * In particular, free the memory that was allocated on the device. + */ +static __isl_give isl_printer *clear_device(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + p = free_device_arrays(p, prog); + + return p; +} + +/* Print a statement for copying an array to or from the device, + * or for initializing or clearing the device. + * The statement identifier of a copying node is called + * "to_device_" or "from_device_" and + * its user pointer points to the gpu_array_info of the array + * that needs to be copied. + * The node for initializing the device is called "init_device". + * The node for clearing the device is called "clear_device". + * + * Extract the array (if any) from the identifier and call + * init_device, clear_device, copy_array_to_device or copy_array_from_device. 
+ */ +static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node, struct gpu_prog *prog) +{ + isl_ast_expr *expr, *arg; + isl_id *id; + const char *name; + struct gpu_array_info *array; + + expr = isl_ast_node_user_get_expr(node); + arg = isl_ast_expr_get_op_arg(expr, 0); + id = isl_ast_expr_get_id(arg); + name = isl_id_get_name(id); + array = isl_id_get_user(id); + isl_id_free(id); + isl_ast_expr_free(arg); + isl_ast_expr_free(expr); + + if (!name) + return isl_printer_free(p); + if (!strcmp(name, "init_device")) + return init_device(p, prog); + if (!strcmp(name, "clear_device")) + return clear_device(p, prog); + if (!array) + return isl_printer_free(p); + + if (!prefixcmp(name, "to_device")) + return copy_array_to_device(p, array); + else + return copy_array_from_device(p, array); +} + +struct print_host_user_data { + struct cuda_info *cuda; + struct gpu_prog *prog; +}; + +/* Print the user statement of the host code to "p". + * + * The host code may contain original user statements, kernel launches, + * statements that copy data to/from the device and statements + * the initialize or clear the device. + * The original user statements and the kernel launches have + * an associated annotation, while the other statements do not. + * The latter are handled by print_device_node. + * The annotation on the user statements is called "user". + * + * In case of a kernel launch, print a block of statements that + * defines the grid and the block and then launches the kernel. + */ +__isl_give isl_printer *print_host_user(__isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options, + __isl_keep isl_ast_node *node, void *user) +{ + isl_id *id; + int is_user; + struct ppcg_kernel *kernel; + struct ppcg_kernel_stmt *stmt; + struct print_host_user_data *data; + + isl_ast_print_options_free(print_options); + + data = (struct print_host_user_data *) user; + + id = isl_ast_node_get_annotation(node); + if (!id) + return print_device_node(p, node, data->prog); + + is_user = !strcmp(isl_id_get_name(id), "user"); + kernel = is_user ? NULL : isl_id_get_user(id); + stmt = is_user ? 
	isl_id_get_user(id) : NULL;
+	isl_id_free(id);
+
+	if (is_user)
+		return ppcg_kernel_print_domain(p, stmt);
+
+	p = ppcg_start_block(p);
+
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "dim3 k");
+	p = isl_printer_print_int(p, kernel->id);
+	p = isl_printer_print_str(p, "_dimBlock");
+	p = print_reverse_list(p, kernel->n_block, kernel->block_dim);
+	p = isl_printer_print_str(p, ";");
+	p = isl_printer_end_line(p);
+
+	p = print_grid(p, kernel);
+
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "kernel");
+	p = isl_printer_print_int(p, kernel->id);
+	p = isl_printer_print_str(p, " <<<k");
+	p = isl_printer_print_int(p, kernel->id);
+	p = isl_printer_print_str(p, "_dimGrid, k");
+	p = isl_printer_print_int(p, kernel->id);
+	p = isl_printer_print_str(p, "_dimBlock>>> (");
+	p = print_kernel_arguments(p, data->prog, kernel, 0);
+	p = isl_printer_print_str(p, ");");
+	p = isl_printer_end_line(p);
+
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "cudaCheckKernel();");
+	p = isl_printer_end_line(p);
+
+	p = ppcg_end_block(p);
+
+	p = isl_printer_start_line(p);
+	p = isl_printer_end_line(p);
+
+#if 0
+	print_kernel(data->prog, kernel, data->cuda);
+#endif
+
+	return p;
+}
+
+static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
+	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
+	struct cuda_info *cuda)
+{
+	isl_ast_print_options *print_options;
+	isl_ctx *ctx = isl_ast_node_get_ctx(tree);
+	struct print_host_user_data data = { cuda, prog };
+
+	print_options = isl_ast_print_options_alloc(ctx);
+	print_options = isl_ast_print_options_set_print_user(print_options,
+					&print_host_user, &data);
+
+	p = gpu_print_macros(p, tree);
+	p = isl_ast_node_print(tree, p, print_options);
+
+	return p;
+}
+
+/* Given a gpu_prog "prog" and the corresponding transformed AST
+ * "tree", print the entire CUDA code to "p".
+ * "types" collects the types for which a definition has already
+ * been printed.
+ */
+static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
+	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
+	struct gpu_types *types, void *user)
+{
+	struct cuda_info *cuda = user;
+	isl_printer *kernel;
+
+	kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
+	kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
+	kernel = gpu_print_types(kernel, types, prog);
+	isl_printer_free(kernel);
+
+	if (!kernel)
+		return isl_printer_free(p);
+
+	p = print_host_code(p, prog, tree, cuda);
+
+	return p;
+}
+
+/* Transform the code in the file called "input" by replacing
+ * all scops by corresponding CUDA code.
+ * The names of the output files are derived from "input".
+ *
+ * We let generate_gpu do all the hard work and then let it call
+ * us back for printing the AST in print_cuda.
+ *
+ * To prepare for this printing, we first open the output files
+ * and we close them after generate_gpu has finished.
+ */
+int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
+	const char *input)
+{
+	struct cuda_info cuda;
+	int r;
+
+	cuda_open_files(&cuda, input);
+
+	r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
+
+	cuda_close_files(&cuda);
+
+	return r;
+}
diff --git a/polly/lib/External/ppcg/cuda_common.h b/polly/lib/External/ppcg/cuda_common.h
new file mode 100644
--- /dev/null
+++ b/polly/lib/External/ppcg/cuda_common.h
@@ -0,0 +1,15 @@
+#ifndef _CUDA_COMMON_H_
+#define _CUDA_COMMON_H_
+
+#include <stdio.h>
+
+struct cuda_info {
+	FILE *host_c;
+	FILE *kernel_c;
+	FILE *kernel_h;
+};
+
+void cuda_open_files(struct cuda_info *info, const char *input);
+void cuda_close_files(struct cuda_info *info);
+
+#endif
diff --git a/polly/lib/External/ppcg/cuda_common.c b/polly/lib/External/ppcg/cuda_common.c
new file mode 100644
--- /dev/null
+++ b/polly/lib/External/ppcg/cuda_common.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2010 INRIA Saclay
+ *
+ * Use of this software is governed by the MIT license
+ *
+ * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
+ * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
+ * 91893 Orsay, France
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "cuda_common.h"
+#include "ppcg.h"
+
+/* Open the host .cu file and the kernel .hu and .cu files for writing.
+ * Add the necessary includes.
+ */
+void cuda_open_files(struct cuda_info *info, const char *input)
+{
+	char name[PATH_MAX];
+	int len;
+
+	len = ppcg_extract_base_name(name, input);
+
+	strcpy(name + len, "_host.cu");
+	info->host_c = fopen(name, "w");
+
+	strcpy(name + len, "_kernel.cu");
+	info->kernel_c = fopen(name, "w");
+
+	strcpy(name + len, "_kernel.hu");
+	info->kernel_h = fopen(name, "w");
+	fprintf(info->host_c, "#include <assert.h>\n");
+	fprintf(info->host_c, "#include <stdio.h>\n");
+	fprintf(info->host_c, "#include \"%s\"\n", name);
+	fprintf(info->kernel_c, "#include \"%s\"\n", name);
+	fprintf(info->kernel_h, "#include \"cuda.h\"\n\n");
+}
+
+/* Close all output files.
+ */ +void cuda_close_files(struct cuda_info *info) +{ + fclose(info->kernel_c); + fclose(info->kernel_h); + fclose(info->host_c); +} diff --git a/polly/lib/External/ppcg/external.c b/polly/lib/External/ppcg/external.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/external.c @@ -0,0 +1,192 @@ +#include +#include +#include +#include +#include "cpu.h" +#include "opencl.h" + + +#define die() { \ + fprintf(stderr, "Dummy function %s called\n", __FUNCTION__); \ + abort(); \ +} + +__isl_give isl_union_map *pet_scop_compute_outer_to_any( + __isl_keep pet_scop *scop) { + die(); +} +__isl_give isl_union_map *pet_scop_compute_outer_to_inner( + __isl_keep pet_scop *scop) { + die(); +} +enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree) { + die(); +} +int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree, + int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) { + die(); +} +isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr) { + die(); +} +isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr) { + die(); +} +isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr) { + die(); +} +__isl_give isl_union_map *pet_expr_access_get_tagged_may_read( + __isl_keep pet_expr *expr) { + die(); +} +__isl_give isl_union_map *pet_expr_access_get_tagged_may_write( + __isl_keep pet_expr *expr) { + die(); +} +__isl_give isl_union_map *pet_expr_access_get_must_write( + __isl_keep pet_expr *expr) { + die(); +} +__isl_give isl_multi_pw_aff *pet_expr_access_get_index( + __isl_keep pet_expr *expr) { + die(); +} +__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr) { + die(); +} +__isl_give isl_printer *print_cpu(__isl_take isl_printer *p, + struct ppcg_scop *ps, struct ppcg_options *options) { + die(); +} + +__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt, + __isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr) { + die(); +} +unsigned pet_loc_get_start(__isl_keep pet_loc *loc) { + die(); +} +unsigned pet_loc_get_end(__isl_keep pet_loc *loc) { + die(); +} +int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output, + __isl_give isl_printer *(*transform)(__isl_take isl_printer *p, + __isl_take pet_scop *scop, void *user), void *user) { + die(); +} +__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop, + __isl_take isl_printer *p) { + die(); +} +__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop) { + die(); +} +__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop) { + die(); +} +int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop) { + die(); +} +int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop) { + die(); +} +int pet_tree_foreach_expr(__isl_keep pet_tree *tree, + int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) { + die(); +} +int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr, + int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) { + die(); +} +int pet_stmt_is_kill(struct pet_stmt *stmt) { + die(); +} +struct isl_args pet_options_args; +const char *ppcg_version(void) { + die(); +} +int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val) { + die(); +} +int generate_opencl(isl_ctx *ctx, struct ppcg_options *options, + const char *input, const char *output) { + die(); +} +int generate_cpu(isl_ctx *ctx, struct ppcg_options *options, + const char *input, const char *output) { + die(); +} +__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt, + __isl_keep isl_ast_build 
*build, + __isl_give isl_multi_pw_aff *(*fn_index)( + __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id, + void *user), void *user_index, + __isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr, + __isl_keep isl_id *id, void *user), void *user_expr) { + die(); +} +__isl_give isl_union_map *pet_scop_get_tagged_may_reads( + __isl_keep pet_scop *scop) { + die(); +} +__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop) { + die(); +} +__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop) { + die(); +} +__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop) { + die(); +} +__isl_give isl_union_map *pet_scop_get_tagged_may_writes( + __isl_keep pet_scop *scop) { + die(); +} +__isl_give isl_union_map *pet_scop_get_tagged_must_writes( + __isl_keep pet_scop *scop) { + die(); +} +__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop) { + die(); +} +__isl_give isl_union_map *pet_scop_get_tagged_must_kills( + __isl_keep pet_scop *scop) { + die(); +} +__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr) { + die(); +} +__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr, + __isl_keep const char *name) { + die(); +} +__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos) { + die(); +} +__isl_give pet_expr *pet_expr_new_cast(const char *type_name, + __isl_take pet_expr *arg) { + die(); +} +__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos, + __isl_take pet_expr *arg) { + die(); +} +__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree) { + die(); +} +__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree) { + die(); +} +__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree, + __isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user), + void *user) { + die(); +} +__isl_give isl_union_map *pet_expr_access_get_may_read( + __isl_keep pet_expr *expr) { + die(); +} +__isl_give isl_union_map *pet_expr_access_get_may_write( + __isl_keep pet_expr *expr) { + die(); +} diff --git a/polly/lib/External/ppcg/gpu.h b/polly/lib/External/ppcg/gpu.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu.h @@ -0,0 +1,459 @@ +#ifndef _GPU_H +#define _GPU_H + +#include +#include +#include + +#include + +#include "ppcg.h" +#include "ppcg_options.h" + +/* An access to an outer array element or an iterator. + * Accesses to iterators have an access relation that maps to an unnamed space. + * An access may be both read and write. + * If the access relation is empty, then the output dimension may + * not be equal to the dimension of the corresponding array. + */ +struct gpu_stmt_access { + /* Access reads elements */ + int read; + /* Access writes elements */ + int write; + /* All writes are definite writes. */ + int exact_write; + /* Is a single, fixed element being accessed? */ + isl_bool fixed_element; + /* The number of index expressions specified in the access. */ + int n_index; + + /* May access relation */ + isl_map *access; + /* May access relation with as domain a mapping from iteration domain + * to a reference identifier. + */ + isl_map *tagged_access; + /* The reference id of the corresponding pet_expr. */ + isl_id *ref_id; + + struct gpu_stmt_access *next; +}; + +/* A representation of a user statement. + * "stmt" points to the corresponding pet statement. + * "id" is the identifier of the instance set of the statement. 
+ * "accesses" is a linked list of accesses performed by the statement. + * If the statement has been killed, i.e., if it will not be scheduled, + * then this linked list may be empty even if the actual statement does + * perform accesses. + */ +struct gpu_stmt { + isl_id *id; + struct pet_stmt *stmt; + + struct gpu_stmt_access *accesses; +}; + +/* Represents an outer array possibly accessed by a gpu_prog. + */ +struct gpu_array_info { + /* The array data space. */ + isl_space *space; + /* Element type. */ + char *type; + /* Element size. */ + int size; + /* Name of the array. */ + char *name; + /* Declared extent of original array. */ + isl_set *declared_extent; + /* AST expression for declared size of original array. */ + isl_ast_expr *declared_size; + /* Extent of the array that needs to be copied. */ + isl_set *extent; + /* Number of indices. */ + unsigned n_index; + /* For each index, a bound on "extent" in that direction. */ + isl_multi_pw_aff *bound; + /* The corresponding access AST expression, if the array needs + * to be allocated on the device. + */ + isl_ast_expr *bound_expr; + + /* All references to this array; point to elements of a linked list. */ + int n_ref; + struct gpu_stmt_access **refs; + + /* Is this array accessed at all by the program? */ + int accessed; + + /* Is this a scalar that is read-only within the entire program? */ + int read_only_scalar; + + /* Are the elements of the array structures? */ + int has_compound_element; + + /* Are the elements only accessed through constant index expressions? */ + int only_fixed_element; + + /* Is the array local to the scop? */ + int local; + /* Is the array local and should it be declared on the host? */ + int declare_local; + + /* Is the corresponding global device memory accessed in any way? */ + int global; + + /* Should the array be linearized? */ + int linearize; + + /* Order dependences on this array. + * Only used if live_range_reordering option is set. + * It is set to NULL otherwise. + */ + isl_union_map *dep_order; + + void *user; +}; + +/* Represents an outer array accessed by a ppcg_kernel, localized + * to the context of this kernel. + * + * "array" points to the corresponding array in the gpu_prog. + * The "n_group" "groups" are the reference groups associated to the array. + * If "force_private" is set, then the array (in practice a scalar) + * must be mapped to a register. + * "global" is set if the global device memory corresponding + * to this array is accessed by the kernel. + * "bound" is equal to array->bound specialized to the current kernel. + * "bound_expr" is the corresponding access AST expression. + */ +struct gpu_local_array_info { + struct gpu_array_info *array; + + int n_group; + struct gpu_array_ref_group **groups; + + int force_private; + int global; + + unsigned n_index; + isl_multi_pw_aff *bound; + isl_ast_expr *bound_expr; +}; + +__isl_give isl_ast_expr *gpu_local_array_info_linearize_index( + struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr); + +/* A sequence of "n" names of types. + */ +struct gpu_types { + int n; + char **name; +}; + +/* "read" and "write" contain the original access relations, possibly + * involving member accesses. + * + * The elements of "array", as well as the ranges of "copy_in" and "copy_out" + * only refer to the outer arrays of any possible member accesses. 
+ */ +struct gpu_prog { + isl_ctx *ctx; + + struct ppcg_scop *scop; + + /* Set of parameter values */ + isl_set *context; + + /* All potential read accesses in the entire program */ + isl_union_map *read; + + /* All potential write accesses in the entire program */ + isl_union_map *may_write; + /* All definite write accesses in the entire program */ + isl_union_map *must_write; + /* All tagged definite kills in the entire program */ + isl_union_map *tagged_must_kill; + + /* The set of inner array elements that may be preserved. */ + isl_union_set *may_persist; + + /* A mapping from all innermost arrays to their outer arrays. */ + isl_union_map *to_outer; + /* A mapping from the outer arrays to all corresponding inner arrays. */ + isl_union_map *to_inner; + /* A mapping from all intermediate arrays to their outer arrays, + * including an identity mapping from the anonymous 1D space to itself. + */ + isl_union_map *any_to_outer; + + /* Order dependences on non-scalars. */ + isl_union_map *array_order; + + /* Array of statements */ + int n_stmts; + struct gpu_stmt *stmts; + + int n_array; + struct gpu_array_info *array; +}; + +struct gpu_gen { + isl_ctx *ctx; + struct ppcg_options *options; + + /* Callback for printing of AST in appropriate format. */ + __isl_give isl_printer *(*print)(__isl_take isl_printer *p, + struct gpu_prog *prog, __isl_keep isl_ast_node *tree, + struct gpu_types *types, void *user); + void *print_user; + + isl_id_to_ast_expr *(*build_ast_expr)(void *stmt, + isl_ast_build *build, + isl_multi_pw_aff *(*fn_index)( + __isl_take isl_multi_pw_aff *mpa, isl_id *id, + void *user), + void *user_index, + isl_ast_expr *(*fn_expr)(isl_ast_expr *expr, + isl_id *id, void *user), + void *user_expr); + + struct gpu_prog *prog; + /* The generated AST. */ + isl_ast_node *tree; + + /* The sequence of types for which a definition has been printed. */ + struct gpu_types types; + + /* User specified tile, grid and block sizes for each kernel */ + isl_union_map *sizes; + + /* Effectively used tile, grid and block sizes for each kernel */ + isl_union_map *used_sizes; + + /* Identifier of the next kernel. */ + int kernel_id; +}; + +enum ppcg_group_access_type { + ppcg_access_global, + ppcg_access_shared, + ppcg_access_private +}; + +enum ppcg_kernel_stmt_type { + ppcg_kernel_copy, + ppcg_kernel_domain, + ppcg_kernel_sync +}; + +/* Representation of special statements, in particular copy statements + * and __syncthreads statements, inside a kernel. + * + * type represents the kind of statement + * + * + * for ppcg_kernel_copy statements we have + * + * read is set if the statement should copy data from global memory + * to shared memory or registers. 
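+ * If read is not set, then the statement copies data back from shared
+ * memory or registers to global memory.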
+ * + * index expresses an access to the array element that needs to be copied + * local_index expresses the corresponding element in the tile + * + * array refers to the original array being copied + * local_array is a pointer to the appropriate element in the "array" + * array of the ppcg_kernel to which this copy access belongs + * + * + * for ppcg_kernel_domain statements we have + * + * stmt is the corresponding input statement + * + * n_access is the number of accesses in stmt + * access is an array of local information about the accesses + */ +struct ppcg_kernel_stmt { + enum ppcg_kernel_stmt_type type; + + union { + struct { + int read; + isl_ast_expr *index; + isl_ast_expr *local_index; + struct gpu_array_info *array; + struct gpu_local_array_info *local_array; + } c; + struct { + struct gpu_stmt *stmt; + isl_id_to_ast_expr *ref2expr; + } d; + } u; +}; + +/* Representation of a local variable in a kernel. + */ +struct ppcg_kernel_var { + struct gpu_array_info *array; + enum ppcg_group_access_type type; + char *name; + isl_vec *size; +}; + +/* Representation of a kernel. + * + * prog describes the original code from which the kernel is extracted. + * + * id is the sequence number of the kernel. + * + * block_ids contains the list of block identifiers for this kernel. + * thread_ids contains the list of thread identifiers for this kernel. + * + * the first n_grid elements of grid_dim represent the specified size + * of the grid. + * the first n_block elements of block_dim represent the specified or + * effective size of the block. + * Note that in the input file, the sizes of the grid and the blocks + * are specified in the order x, y, z, but internally, the sizes + * are stored in reverse order, so that the last element always + * refers to the x dimension. + * + * grid_size reflects the effective grid size. + * grid_size_expr contains a corresponding access AST expression, built within + * the context where the launch appears. + * + * context contains the values of the parameters and outer schedule dimensions + * for which any statement instance in this kernel needs to be executed. + * + * n_sync is the number of synchronization operations that have + * been introduced in the schedule tree corresponding to this kernel (so far). + * + * core contains the spaces of the statement domains that form + * the core computation of the kernel. It is used to navigate + * the tree during the construction of the device part of the schedule + * tree in gpu_create_kernel. + * + * expanded_domain contains the original statement instances, + * i.e., those that appear in the domains of access relations, + * that are involved in the kernel. + * contraction maps those original statement instances to + * the statement instances that are active at the point + * in the schedule tree where the kernel is created. + * + * arrays is the set of possibly accessed outer array elements. + * + * space is the schedule space of the AST context. That is, it represents + * the loops of the generated host code containing the kernel launch. + * + * n_array is the total number of arrays in the input program and also + * the number of element in the array array. + * array contains information about each array that is local + * to the current kernel. If an array is not used in a kernel, + * then the corresponding entry does not contain any information. 
+ * + * any_force_private is set if any array in the kernel is marked force_private + * + * block_filter contains constraints on the domain elements in the kernel + * that encode the mapping to block identifiers, where the block identifiers + * are represented by "n_grid" parameters with as names the elements + * of "block_ids". + * + * thread_filter contains constraints on the domain elements in the kernel + * that encode the mapping to thread identifiers, where the thread identifiers + * are represented by "n_block" parameters with as names the elements + * of "thread_ids". + * + * copy_schedule corresponds to the schedule dimensions of + * the (tiled) schedule for this kernel that have been taken into account + * for computing private/shared memory tiles. + * The domain corresponds to the original statement instances, i.e., + * those that appear in the leaves of the schedule tree. + * copy_schedule_dim is the dimension of this schedule. + * + * sync_writes contains write references that require synchronization. + * Each reference is represented by a universe set in a space [S[i,j] -> R[]] + * with S[i,j] the statement instance space and R[] the array reference. + */ +struct ppcg_kernel { + isl_ctx *ctx; + struct ppcg_options *options; + + struct gpu_prog *prog; + + int id; + + isl_id_list *block_ids; + isl_id_list *thread_ids; + + int n_grid; + int n_block; + int grid_dim[2]; + int block_dim[3]; + + isl_multi_pw_aff *grid_size; + isl_ast_expr *grid_size_expr; + isl_set *context; + + int n_sync; + isl_union_set *core; + isl_union_set *arrays; + + isl_union_pw_multi_aff *contraction; + isl_union_set *expanded_domain; + + isl_space *space; + + int n_array; + struct gpu_local_array_info *array; + + int n_var; + struct ppcg_kernel_var *var; + + int any_force_private; + + isl_union_set *block_filter; + isl_union_set *thread_filter; + isl_union_pw_multi_aff *copy_schedule; + int copy_schedule_dim; + + isl_union_set *sync_writes; + + isl_ast_node *tree; +}; + +int gpu_array_is_scalar(struct gpu_array_info *array); +int gpu_array_is_read_only_scalar(struct gpu_array_info *array); +int gpu_array_requires_device_allocation(struct gpu_array_info *array); +__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array); +isl_bool gpu_array_can_be_private(struct gpu_array_info *array); + +struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop); +void *gpu_prog_free(struct gpu_prog *prog); + +int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i); + +int generate_gpu(isl_ctx *ctx, const char *input, FILE *out, + struct ppcg_options *options, + __isl_give isl_printer *(*print)(__isl_take isl_printer *p, + struct gpu_prog *prog, __isl_keep isl_ast_node *tree, + struct gpu_types *types, void *user), void *user); + +__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen, + __isl_take isl_schedule_node *node, int scale, + __isl_keep isl_multi_val *sizes); + +__isl_give isl_schedule *get_schedule(struct gpu_gen *gen); +int has_any_permutable_node(__isl_keep isl_schedule *schedule); +__isl_give isl_schedule *map_to_device(struct gpu_gen *gen, + __isl_take isl_schedule *schedule, + int to_from_device); +__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, + __isl_take isl_schedule *schedule); + +__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog); +void collect_references(struct gpu_prog *prog, struct gpu_array_info *array); +void collect_order_dependences(struct gpu_prog *prog); +isl_bool only_fixed_element_accessed(struct 
gpu_array_info *array); +#endif diff --git a/polly/lib/External/ppcg/gpu.c b/polly/lib/External/ppcg/gpu.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu.c @@ -0,0 +1,5849 @@ +/* + * Copyright 2010-2011 INRIA Saclay + * Copyright 2012-2013 Ecole Normale Superieure + * Copyright 2015-2016 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, + * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, + * 91893 Orsay, France + * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" +#include "gpu.h" +#include "gpu_array_tile.h" +#include "gpu_group.h" +#include "gpu_hybrid.h" +#include "gpu_tree.h" +#include "hybrid.h" +#include "schedule.h" +#include "ppcg_options.h" +#include "print.h" +#include "util.h" + +struct gpu_array_info; + +/* Return the name of the outer array (of structs) accessed by "access". + */ +static const char *get_outer_array_name(__isl_keep isl_map *access) +{ + isl_space *space; + const char *name; + + space = isl_space_range(isl_map_get_space(access)); + while (space && isl_space_is_wrapping(space)) + space = isl_space_domain(isl_space_unwrap(space)); + name = isl_space_get_tuple_name(space, isl_dim_set); + isl_space_free(space); + + return name; +} + +/* Collect all references to the given array and store pointers to them + * in array->refs. + */ +void collect_references(struct gpu_prog *prog, + struct gpu_array_info *array) +{ + int i; + int n; + + n = 0; + for (i = 0; i < prog->n_stmts; ++i) { + struct gpu_stmt *stmt = &prog->stmts[i]; + struct gpu_stmt_access *access; + + for (access = stmt->accesses; access; access = access->next) { + const char *name; + name = get_outer_array_name(access->access); + if (name && !strcmp(array->name, name)) + n++; + } + } + + array->n_ref = n; + array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n); + assert(array->refs); + + n = 0; + for (i = 0; i < prog->n_stmts; ++i) { + struct gpu_stmt *stmt = &prog->stmts[i]; + struct gpu_stmt_access *access; + + for (access = stmt->accesses; access; access = access->next) { + const char *name; + name = get_outer_array_name(access->access); + if (!name || strcmp(array->name, name)) + continue; + + array->refs[n++] = access; + } + } +} + +/* Compute and return the extent of "array", taking into account the set of + * accessed elements. + * + * In particular, the extent in the outer dimension is taken + * from "accessed", while the extents in the remaining dimensions + * are taken from array->extent. + * + * The extent in the outer dimension cannot be taken from array->extent + * because that may be unbounded. Furthermore, even if it is bounded, + * it may be larger than the piece of the array that is being accessed. 
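+ * For example, if A is declared as A[n][100] but only its first ten rows
+ * are accessed, then the extent in the outer dimension becomes [0, 9]
+ * while the extent in the inner dimension remains [0, 99].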
+ */ +static __isl_give isl_set *compute_extent(struct pet_array *array, + __isl_keep isl_set *accessed) +{ + int n_index; + isl_id *id; + isl_set *outer; + isl_set *extent; + + extent = isl_set_copy(array->extent); + + n_index = isl_set_dim(accessed, isl_dim_set); + if (n_index == 0) + return extent; + + extent = isl_set_project_out(extent, isl_dim_set, 0, 1); + outer = isl_set_copy(accessed); + outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1); + extent = isl_set_flat_product(outer, extent); + id = isl_set_get_tuple_id(accessed); + extent = isl_set_set_tuple_id(extent, id); + + return extent; +} + +/* Is the array "array" being extracted a read-only scalar? + * + * That is, is "array" a scalar that is never possibly written to. + * An array containing structures is never considered to be a scalar. + */ +static int is_read_only_scalar(struct gpu_array_info *array, + struct gpu_prog *prog) +{ + isl_set *space; + isl_union_map *write; + int empty; + + if (array->has_compound_element) + return 0; + if (array->n_index != 0) + return 0; + + write = isl_union_map_copy(prog->may_write); + space = isl_set_universe(isl_space_copy(array->space)); + write = isl_union_map_intersect_range(write, + isl_union_set_from_set(space)); + empty = isl_union_map_is_empty(write); + isl_union_map_free(write); + + return empty; +} + +/* Is "array" only accessed as individual, fixed elements? + * That is, does each access to "array" access a single, fixed element? + */ +isl_bool only_fixed_element_accessed(struct gpu_array_info *array) +{ + int i; + + for (i = 0; i < array->n_ref; ++i) + if (!array->refs[i]->fixed_element) + return isl_bool_false; + + return isl_bool_true; +} + +/* Compute bounds on the host array "pa" based on the corresponding + * accessed elements in "arrays" + * and collect all references to the array. + * Store the results in "info". + * + * If the array is zero-dimensional and does not contain structures, + * i.e., if the array is a scalar, we check whether it is read-only. + * We also check whether the array is accessed at all. 
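+ * If any of the computed bounds is not a constant expression, for example
+ * because it involves a parameter, then the array is marked for
+ * linearization.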
+ */ +static int extract_array_info(struct gpu_prog *prog, + struct gpu_array_info *info, struct pet_array *pa, + __isl_keep isl_union_set *arrays) +{ + int empty; + const char *name; + int n_index; + isl_multi_pw_aff *bounds; + isl_set *accessed, *extent; + + n_index = isl_set_dim(pa->extent, isl_dim_set); + name = isl_set_get_tuple_name(pa->extent); + + info->space = isl_set_get_space(pa->extent); + info->name = strdup(name); + info->n_index = n_index; + info->linearize = prog->scop->options->linearize_device_arrays; + + info->type = strdup(pa->element_type); + info->size = pa->element_size; + info->local = pa->declared && !pa->exposed; + info->has_compound_element = pa->element_is_record; + info->read_only_scalar = is_read_only_scalar(info, prog); + + info->declared_extent = isl_set_copy(pa->extent); + accessed = isl_union_set_extract_set(arrays, + isl_space_copy(info->space)); + empty = isl_set_is_empty(accessed); + extent = compute_extent(pa, accessed); + isl_set_free(accessed); + info->extent = extent; + if (empty < 0) + return -1; + info->accessed = !empty; + bounds = ppcg_size_from_extent(isl_set_copy(extent)); + bounds = isl_multi_pw_aff_gist(bounds, isl_set_copy(prog->context)); + if (!bounds) + return -1; + if (!isl_multi_pw_aff_is_cst(bounds)) + info->linearize = 1; + info->bound = bounds; + + collect_references(prog, info); + info->only_fixed_element = only_fixed_element_accessed(info); + + return 0; +} + +/* Remove independence from the order constraints "order" on array "array". + * Since the pairs of iterations in the filter relation of an independence + * are guaranteed to be completely independent by the user, there is + * no need to ensure that live ranges are ordered along those pairs. + * We make an exception for local variables, though, as the independence + * guarantee does not apply to those. + * + * The order constraints are used in two places. + * Those on scalars are used in check_scalar_live_ranges to check if + * we need to force the scalar to be private. Any non-local scalar + * should not be forced scalar if it only appears in independent loops. + * Those on non-scalars are added to the coincidence constraints + * in compute_schedule because we do not support any array expansion. + * Accesses to non-local arrays should not prevent a loop from being + * considered coincident so we should indeed remove those constraints + * from the order constraints. + */ +static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog, + struct gpu_array_info *array, __isl_take isl_union_map *order) +{ + // We do not have independence information in Polly. Hence, make this + // function a no-op. + return order; + int i; + + for (i = 0; i < prog->scop->pet->n_independence; ++i) { + struct pet_independence *pi = prog->scop->pet->independences[i]; + if (isl_union_set_contains(pi->local, array->space)) + continue; + + order = isl_union_map_subtract(order, + isl_union_map_copy(pi->filter)); + } + + return order; +} + +/* For each array in "prog", store the (untagged) order dependences + * derived from the array in array->dep_order. + * In particular, consider all references that access the given array + * and take the order dependences that have one of these references + * as source. (Since an order dependence relates two references to + * the same array, the target of these order dependences will also + * be one of these references.) 
+ * Additionally, store the union of these array->dep_order relations + * for all arrays that cannot be mapped to private memory in prog->array_order. + */ +void collect_order_dependences(struct gpu_prog *prog) +{ + int i; + isl_space *space; + isl_union_map *accesses; + + space = isl_union_map_get_space(prog->read); + prog->array_order = isl_union_map_empty(space); + + accesses = isl_union_map_copy(prog->scop->tagged_reads); + accesses = isl_union_map_union(accesses, + isl_union_map_copy(prog->scop->tagged_may_writes)); + accesses = isl_union_map_universe(accesses); + accesses = isl_union_map_apply_range(accesses, + isl_union_map_copy(prog->to_outer)); + + for (i = 0; i < prog->n_array; ++i) { + struct gpu_array_info *array = &prog->array[i]; + isl_set *set; + isl_union_set *uset; + isl_union_map *order; + + set = isl_set_universe(isl_space_copy(array->space)); + uset = isl_union_set_from_set(set); + uset = isl_union_map_domain( + isl_union_map_intersect_range(isl_union_map_copy(accesses), + uset)); + order = isl_union_map_copy(prog->scop->tagged_dep_order); + order = isl_union_map_intersect_domain(order, uset); + order = isl_union_map_zip(order); + order = isl_union_set_unwrap(isl_union_map_domain(order)); + order = remove_independences(prog, array, order); + array->dep_order = order; + + if (gpu_array_can_be_private(array)) + continue; + + prog->array_order = isl_union_map_union(prog->array_order, + isl_union_map_copy(array->dep_order)); + } + + isl_union_map_free(accesses); +} + +/* Construct a gpu_array_info for each array referenced by prog->scop and + * collect them in prog->array. + * + * The sizes are based on the extents and the set of possibly accessed + * elements by "prog". + * If there are any member accesses involved, then they are first mapped + * to the outer arrays of structs. + * Only extract gpu_array_info entries for these outer arrays. + * + * If we are allowing live range reordering, then also set + * the dep_order field. Otherwise leave it NULL. 
+ */ +static int collect_array_info(struct gpu_prog *prog) +{ + int i; + int r = 0; + isl_union_set *arrays; + + arrays = isl_union_map_range(isl_union_map_copy(prog->read)); + arrays = isl_union_set_union(arrays, + isl_union_map_range(isl_union_map_copy(prog->may_write))); + + arrays = isl_union_set_apply(arrays, + isl_union_map_copy(prog->to_outer)); + + arrays = isl_union_set_coalesce(arrays); + + prog->n_array = prog->scop->pet->n_array; + prog->array = isl_calloc_array(prog->ctx, + struct gpu_array_info, prog->n_array); + assert(prog->array); + prog->n_array = 0; + for (i = 0; i < prog->scop->pet->n_array; ++i) { + isl_bool field; + + field = isl_set_is_wrapping(prog->scop->pet->arrays[i]->extent); + if (field < 0) + break; + if (field) + continue; + if (extract_array_info(prog, &prog->array[prog->n_array++], + prog->scop->pet->arrays[i], arrays) < 0) + r = -1; + } + if (i < prog->scop->pet->n_array) + r = -1; + + isl_union_set_free(arrays); + + if (prog->scop->options->live_range_reordering) + collect_order_dependences(prog); + + return r; +} + +static void free_array_info(struct gpu_prog *prog) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + free(prog->array[i].type); + free(prog->array[i].name); + isl_multi_pw_aff_free(prog->array[i].bound); + isl_ast_expr_free(prog->array[i].bound_expr); + isl_space_free(prog->array[i].space); + isl_set_free(prog->array[i].declared_extent); + isl_set_free(prog->array[i].extent); + isl_ast_expr_free(prog->array[i].declared_size); + free(prog->array[i].refs); + isl_union_map_free(prog->array[i].dep_order); + } + free(prog->array); +} + +/* Check if a gpu array is a scalar. A scalar is a value that is not stored + * as an array or through a pointer reference, but as a single data element. + * At the moment, scalars are represented as zero-dimensional arrays. + * Note that the single data element may be an entire structure. + */ +int gpu_array_is_scalar(struct gpu_array_info *array) +{ + return array->n_index == 0; +} + +/* Can "array" be mapped to private memory? + * That is, is it only accessed as individual elements with + * constant index expressions? + */ +isl_bool gpu_array_can_be_private(struct gpu_array_info *array) +{ + if (!array) + return isl_bool_error; + return array->only_fixed_element; +} + +/* Is "array" a read-only scalar? + */ +int gpu_array_is_read_only_scalar(struct gpu_array_info *array) +{ + return array->read_only_scalar; +} + +/* Does "array" need to be allocated on the device? + * If it is a read-only scalar, then it will be passed as an argument + * to the kernel and therefore does not require any allocation. + * If this device memory is not accessed at all, then it does not + * need to be allocated either. + */ +int gpu_array_requires_device_allocation(struct gpu_array_info *array) +{ + if (gpu_array_is_read_only_scalar(array)) + return 0; + if (!array->global) + return 0; + return 1; +} + +/* Return the set of parameter values for which the array has a positive + * size in all dimensions. + * If the sizes are only valid for some parameter values, then those + * constraints are also taken into account. 
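+ * For example, for an array declared as A[n][m], the guard consists of
+ * the parameter values satisfying n > 0 and m > 0.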
+ */ +__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array) +{ + int i; + isl_space *space; + isl_set *guard; + + if (!array) + return NULL; + + space = isl_space_params(isl_space_copy(array->space)); + guard = isl_set_universe(space); + + for (i = 0; i < array->n_index; ++i) { + isl_pw_aff *bound; + isl_set *guard_i, *zero; + + bound = isl_multi_pw_aff_get_pw_aff(array->bound, i); + guard_i = isl_pw_aff_nonneg_set(isl_pw_aff_copy(bound)); + zero = isl_pw_aff_zero_set(bound); + guard_i = isl_set_subtract(guard_i, zero); + guard = isl_set_intersect(guard, guard_i); + } + + return guard; +} + +/* Internal data structure for extract_size_of_type. + * "type" specifies the name of the space that we want to extract. + * "res" is used to store the subset of that space. + */ +struct ppcg_extract_size_data { + const char *type; + isl_set *res; +}; + +/* This function is called for each set in a union_set. + * If the name of the set matches data->type, we store the + * set in data->res. + */ +static isl_stat extract_size_of_type(__isl_take isl_set *size, void *user) +{ + struct ppcg_extract_size_data *data = user; + const char *name; + + name = isl_set_get_tuple_name(size); + if (name && !strcmp(name, data->type)) { + data->res = size; + return isl_stat_error; + } + + isl_set_free(size); + return isl_stat_ok; +} + +/* Given a union map { kernel[i] -> *[...] }, + * return the range in the space called "type" for the kernel with + * sequence number "id". + */ +static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes, + const char *type, int id) +{ + isl_space *space; + isl_set *dom; + isl_union_set *local_sizes; + struct ppcg_extract_size_data data = { type, NULL }; + + if (!sizes) + return NULL; + + space = isl_union_map_get_space(sizes); + space = isl_space_set_from_params(space); + space = isl_space_add_dims(space, isl_dim_set, 1); + space = isl_space_set_tuple_name(space, isl_dim_set, "kernel"); + dom = isl_set_universe(space); + dom = isl_set_fix_si(dom, isl_dim_set, 0, id); + + local_sizes = isl_union_set_apply(isl_union_set_from_set(dom), + isl_union_map_copy(sizes)); + isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data); + isl_union_set_free(local_sizes); + return data.res; +} + +/* Given a singleton set, extract the first (at most *len) elements + * of the single integer tuple into *sizes and update *len if needed. + */ +static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len) +{ + int i; + int dim; + + if (!set) + return; + + dim = isl_set_dim(set, isl_dim_set); + if (dim < *len) + *len = dim; + + for (i = 0; i < *len; ++i) { + isl_val *v; + + v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i); + assert(v); + + sizes[i] = isl_val_get_num_si(v); + isl_val_free(v); + } + + isl_set_free(set); +} + +/* Add the map { kernel[id] -> type[sizes] } to gen->used_sizes, + * if the option debug->dump_sizes is set. 
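+ * For example, for the kernel with sequence number 0 and effective block
+ * sizes 32 and 4, the map { kernel[0] -> block[32, 4] } is added.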
+ */ +static void set_used_sizes(struct gpu_gen *gen, const char *type, int id, + int *sizes, int len) +{ + int i; + isl_space *space; + isl_map *map; + + if (!gen->options->debug->dump_sizes) + return; + + space = isl_union_map_get_space(gen->used_sizes); + space = isl_space_set_from_params(space); + space = isl_space_add_dims(space, isl_dim_set, 1); + space = isl_space_set_tuple_name(space, isl_dim_set, "kernel"); + space = isl_space_from_domain(space); + space = isl_space_add_dims(space, isl_dim_out, len); + space = isl_space_set_tuple_name(space, isl_dim_out, type); + + map = isl_map_universe(space); + map = isl_map_fix_si(map, isl_dim_in, 0, id); + for (i = 0; i < len; ++i) + map = isl_map_fix_si(map, isl_dim_out, i, sizes[i]); + + gen->used_sizes = isl_union_map_add_map(gen->used_sizes, map); +} + +/* Extract user specified "tile" sizes from the "sizes" command line option, + * defaulting to option->tile_size in each dimension. + * *tile_len contains the maximum number of tile sizes needed. + * Update *tile_len to the number of specified tile sizes, if any, and + * return a pointer to the tile sizes (or NULL on error). + * Add the effectively used sizes to gen->used_sizes. + */ +static int *read_tile_sizes(struct gpu_gen *gen, int *tile_len) +{ + int n; + int *tile_size; + isl_set *size; + + tile_size = isl_alloc_array(gen->ctx, int, *tile_len); + if (!tile_size) + return NULL; + for (n = 0; n < *tile_len; ++n) + tile_size[n] = gen->options->tile_size; + + size = extract_sizes(gen->sizes, "tile", gen->kernel_id); + read_sizes_from_set(size, tile_size, tile_len); + set_used_sizes(gen, "tile", gen->kernel_id, tile_size, *tile_len); + + return tile_size; +} + +/* Extract user specified "block" sizes from the "sizes" command line option, + * after filling in some potentially useful defaults. + */ +static void read_block_sizes(struct ppcg_kernel *kernel, + __isl_keep isl_union_map *sizes) +{ + isl_set *size; + + if (kernel->n_block > 3) + kernel->n_block = 3; + switch (kernel->n_block) { + case 1: + kernel->block_dim[0] = 512; + break; + case 2: + kernel->block_dim[0] = 32; + kernel->block_dim[1] = 16; + break; + default: + kernel->block_dim[0] = 32; + kernel->block_dim[1] = 4; + kernel->block_dim[2] = 4; + break; + } + + size = extract_sizes(sizes, "block", kernel->id); + read_sizes_from_set(size, kernel->block_dim, &kernel->n_block); +} + +/* Extract user specified "grid" sizes from the "sizes" command line option, + * after filling in some potentially useful defaults. + */ +static void read_grid_sizes(struct ppcg_kernel *kernel, + __isl_keep isl_union_map *sizes) +{ + isl_set *size; + + if (kernel->n_grid > 2) + kernel->n_grid = 2; + switch (kernel->n_grid) { + case 1: + kernel->grid_dim[0] = 32768; + break; + default: + kernel->grid_dim[0] = 256; + kernel->grid_dim[1] = 256; + break; + } + + size = extract_sizes(sizes, "grid", kernel->id); + read_sizes_from_set(size, kernel->grid_dim, &kernel->n_grid); +} + +/* Extract user specified grid and block sizes from the gen->sizes + * command line option after filling in some potentially useful defaults. + * Store the extracted sizes in "kernel". + * Add the effectively used sizes to gen->used_sizes. 
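+ * For example, a sizes specification of the form
+ * { kernel[0] -> block[16, 16]; kernel[0] -> grid[256] }
+ * requests 16x16 thread blocks and a one-dimensional grid of 256 blocks
+ * for the kernel with sequence number 0.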
+ */ +static void read_grid_and_block_sizes(struct ppcg_kernel *kernel, + struct gpu_gen *gen) +{ + read_block_sizes(kernel, gen->sizes); + read_grid_sizes(kernel, gen->sizes); + set_used_sizes(gen, "block", kernel->id, + kernel->block_dim, kernel->n_block); + set_used_sizes(gen, "grid", kernel->id, + kernel->grid_dim, kernel->n_grid); +} + +static void *free_stmts(struct gpu_stmt *stmts, int n) +{ + int i; + + if (!stmts) + return NULL; + + for (i = 0; i < n; ++i) { + struct gpu_stmt_access *access, *next; + + for (access = stmts[i].accesses; access; access = next) { + next = access->next; + isl_id_free(access->ref_id); + isl_map_free(access->access); + isl_map_free(access->tagged_access); + free(access); + } + + isl_id_free(stmts[i].id); + } + free(stmts); + + return NULL; +} + +/* Add parameters p[i] with identifiers "ids" to "set", + * with bounds to 0 <= p[i] < size[i]. + */ +__isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set, + int *size, __isl_keep isl_id_list *ids) +{ + int i, len; + unsigned nparam; + + len = isl_id_list_n_id(ids); + nparam = isl_set_dim(set, isl_dim_param); + set = isl_set_add_dims(set, isl_dim_param, len); + + for (i = 0; i < len; ++i) { + isl_id *id; + + id = isl_id_list_get_id(ids, i); + set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id); + set = isl_set_lower_bound_si(set, isl_dim_param, nparam + i, 0); + set = isl_set_upper_bound_si(set, isl_dim_param, + nparam + i, size[i] - 1); + } + + return set; +} + +/* Add "len" parameters p[i] with identifiers "ids" and intersect "set" + * with + * + * { : 0 <= p[i] < size[i] } + * + * or an overapproximation. + */ +static __isl_give isl_set *add_bounded_parameters_dynamic( + __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size, + __isl_keep isl_id_list *ids) +{ + int i, len; + unsigned nparam; + isl_space *space; + isl_local_space *ls; + + len = isl_multi_pw_aff_dim(size, isl_dim_out); + nparam = isl_set_dim(set, isl_dim_param); + set = isl_set_add_dims(set, isl_dim_param, len); + + for (i = 0; i < len; ++i) { + isl_id *id; + + id = isl_id_list_get_id(ids, i); + set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id); + } + + space = isl_space_params(isl_set_get_space(set)); + ls = isl_local_space_from_space(space); + for (i = 0; i < len; ++i) { + isl_pw_aff *param, *size_i, *zero; + isl_set *bound; + + param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls), + isl_dim_param, nparam + i); + + size_i = isl_multi_pw_aff_get_pw_aff(size, i); + bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i); + bound = isl_set_from_basic_set(isl_set_simple_hull(bound)); + set = isl_set_intersect_params(set, bound); + + zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls)); + bound = isl_pw_aff_ge_set(param, zero); + set = isl_set_intersect_params(set, bound); + } + isl_local_space_free(ls); + + return set; +} + +/* Return the union of all tagged access relations in the group. + */ +static __isl_give isl_union_map *group_tagged_access_relation( + struct gpu_array_ref_group *group) +{ + int i; + isl_union_map *access; + + access = isl_union_map_empty(isl_map_get_space(group->access)); + for (i = 0; i < group->n_ref; ++i) { + isl_map *map_i; + + map_i = isl_map_copy(group->refs[i]->tagged_access); + access = isl_union_map_union(access, + isl_union_map_from_map(map_i)); + } + + return access; +} + +/* Return the extent of "array", recomputed from the bounds. + * The recomputed extent may be simpler than the original extent. 
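+ * In particular, the returned extent consists of the index vectors that
+ * are nonnegative and strictly smaller than the corresponding bound in
+ * each dimension.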
+ */ +static __isl_give isl_set *array_extent(struct gpu_array_info *array) +{ + int i; + isl_id *id; + isl_space *space; + isl_local_space *ls; + isl_set *extent; + + id = isl_set_get_tuple_id(array->extent); + space = isl_set_get_space(array->extent); + extent = isl_set_universe(isl_space_copy(space)); + ls = isl_local_space_from_space(space); + for (i = 0; i < array->n_index; ++i) { + isl_pw_aff *bound; + isl_aff *aff; + isl_pw_aff *index; + isl_set *lt; + + extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0); + + aff = isl_aff_var_on_domain(isl_local_space_copy(ls), + isl_dim_set, i); + index = isl_pw_aff_from_aff(aff); + bound = isl_multi_pw_aff_get_pw_aff(array->bound, i); + bound = isl_pw_aff_from_range(bound); + bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index); + bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in, + isl_id_copy(id)); + lt = isl_pw_aff_lt_set(index, bound); + extent = isl_set_intersect(extent, lt); + } + isl_local_space_free(ls); + isl_id_free(id); + + return extent; +} + +/* Return a map from the first group->shared_tile->depth dimensions + * of the computed schedule to the array tile in + * global memory that corresponds to the shared memory copy. + * + * In particular, return a map + * + * { D[i] -> A[a] } + * + * with constraints + * + * tile_offset(i) <= a <= tile_offset(i) + tile_size - 1 (1) + * + * and + * + * 0 <= a <= array_size - 1 (2) + * + * Note that if some stride has been detected (i.e., when + * group->shared_tile->bound[i].shift is set), then a in (1) refers + * to the shifted and scaled down version. + * + * Constraints (1) are obtained by mapping the size constraints on the + * shared/private memory tile back to the access relation. + * Constraints (2) are obtained from the (recomputed) extent. + */ +static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group) +{ + int i; + int n_index = group->array->n_index; + isl_map *tile; + isl_space *space; + isl_set *local; + isl_set *extent; + + space = isl_multi_aff_get_space(group->shared_tile->tiling); + space = isl_space_range(space); + local = isl_set_universe(space); + for (i = 0; i < n_index; ++i) { + isl_val *bound; + + local = isl_set_lower_bound_si(local, isl_dim_set, i, 0); + bound = isl_val_copy(group->shared_tile->bound[i].size); + bound = isl_val_sub_ui(bound, 1); + local = isl_set_upper_bound_val(local, isl_dim_set, i, bound); + } + local = isl_set_preimage_multi_aff(local, + isl_multi_aff_copy(group->shared_tile->tiling)); + tile = isl_set_unwrap(local); + extent = array_extent(group->array); + tile = isl_map_intersect_range(tile, extent); + + return tile; +} + +/* Given a mapping "iterator_map" from the AST schedule to a domain, + * return the corresponding mapping from the AST schedule to + * to the outer kernel->copy_schedule_dim dimensions of + * the schedule computed by PPCG for this kernel. + * + * Note that kernel->copy_schedule_dim is at least as large as + * the largest depth of any array reference group associated to the kernel. + * This is needed as the returned schedule is used to extract a mapping + * to the outer tile->depth dimensions in transform_index. 
+ */ +static __isl_give isl_pw_multi_aff *compute_sched_to_copy( + struct ppcg_kernel *kernel, __isl_take isl_pw_multi_aff *iterator_map) +{ + isl_union_pw_multi_aff *upma; + isl_pw_multi_aff *pma; + isl_space *space; + + space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map)); + space = isl_space_from_domain(space); + space = isl_space_add_dims(space, isl_dim_out, + kernel->copy_schedule_dim); + + upma = isl_union_pw_multi_aff_copy(kernel->copy_schedule); + pma = isl_union_pw_multi_aff_extract_pw_multi_aff(upma, space); + isl_union_pw_multi_aff_free(upma); + + return isl_pw_multi_aff_pullback_pw_multi_aff(pma, iterator_map); +} + +/* If max_shared_memory is not set to infinity (-1), then make + * sure that the total amount of shared memory required by the + * array reference groups mapped to shared memory by "kernel" + * is no larger than this maximum. + * + * We apply a greedy approach and discard (keep in global memory) + * those groups that would result in a total memory size that + * is larger than the maximum. + * + * This function should be called after any function that may + * affect the decision on whether to place a reference group + * in private, shared or global memory. + */ +static void check_shared_memory_bound(struct ppcg_kernel *kernel) +{ + int i, j; + isl_val *left, *size; + + if (kernel->options->max_shared_memory < 0) + return; + + left = isl_val_int_from_si(kernel->ctx, + kernel->options->max_shared_memory); + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *local = &kernel->array[i]; + + for (j = 0; j < local->n_group; ++j) { + struct gpu_array_ref_group *group; + enum ppcg_group_access_type type; + + group = local->groups[j]; + type = gpu_array_ref_group_type(group); + if (type != ppcg_access_shared) + continue; + + size = gpu_array_tile_size(group->shared_tile); + size = isl_val_mul_ui(size, local->array->size); + + if (isl_val_le(size, left)) { + left = isl_val_sub(left, size); + continue; + } + isl_val_free(size); + + group->shared_tile = + gpu_array_tile_free(group->shared_tile); + } + } + + isl_val_free(left); +} + +/* Mark all arrays of "kernel" that have an array reference group + * that is not mapped to private or shared memory as + * accessing the corresponding global device memory. + */ +static void mark_global_arrays(struct ppcg_kernel *kernel) +{ + int i, j; + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *local = &kernel->array[i]; + + if (local->global) + continue; + for (j = 0; j < local->n_group; ++j) { + if (gpu_array_ref_group_tile(local->groups[j])) + continue; + + local->global = 1; + local->array->global = 1; + break; + } + } +} + +/* Compute a tiling for all the array reference groups in "kernel". + */ +static void compute_group_tilings(struct ppcg_kernel *kernel) +{ + int i, j; + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *array = &kernel->array[i]; + + for (j = 0; j < array->n_group; ++j) + gpu_array_ref_group_compute_tiling(array->groups[j]); + } +} + +/* Compute the effective grid size as a list of the sizes in each dimension. + * + * The grid size specified by the user or set by default + * in read_grid_sizes() and applied by the block filter, + * may be too large for the given code in the sense that + * it may contain blocks that don't need to execute anything. + * We therefore don't return this grid size, but instead the + * smallest grid size that ensures that all blocks that actually + * execute code are included in the grid. 
+ * + * We first extract a description of the grid, i.e., the possible values + * of the block ids, from the domain elements in "domain" and + * kernel->block_filter. + * The block ids are parameters in kernel->block_filter. + * We simply need to change them into set dimensions. + * + * Then, for each block dimension, we compute the maximal value of the block id + * and add one. + */ +static __isl_give isl_multi_pw_aff *extract_grid_size( + struct ppcg_kernel *kernel, __isl_take isl_union_set *domain) +{ + int i; + isl_set *grid; + isl_set *context; + isl_multi_pw_aff *size; + + domain = isl_union_set_intersect(domain, + isl_union_set_copy(kernel->block_filter)); + grid = isl_union_set_params(domain); + grid = isl_set_from_params(grid); + grid = isl_set_add_dims(grid, isl_dim_set, kernel->n_grid); + for (i = 0; i < kernel->n_grid; ++i) { + int pos; + isl_id *id; + + id = isl_id_list_get_id(kernel->block_ids, i); + pos = isl_set_find_dim_by_id(grid, isl_dim_param, id); + isl_id_free(id); + assert(pos >= 0); + grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i); + grid = isl_set_project_out(grid, isl_dim_param, pos, 1); + } + + grid = isl_set_coalesce(grid); + size = ppcg_size_from_extent(grid); + context = isl_set_params(isl_set_copy(kernel->context)); + return isl_multi_pw_aff_gist(size, context); +} + +/* Compute the size of a fixed bounding box around the origin and "set", + * where "set" is assumed to contain only non-negative elements, + * and store the results in "size". + * In particular, compute the maximal value of "set" in each direction + * and add one. + */ +static void extract_fixed_size(__isl_take isl_set *set, int *size) +{ + int i, n; + isl_local_space *ls; + isl_aff *obj; + + n = isl_set_dim(set, isl_dim_set); + ls = isl_local_space_from_space(isl_set_get_space(set)); + obj = isl_aff_zero_on_domain(ls); + for (i = 0; i < n; ++i) { + isl_val *max; + + obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1); + max = isl_set_max_val(set, obj); + size[i] = isl_val_get_num_si(max) + 1; + isl_val_free(max); + obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0); + } + isl_aff_free(obj); + isl_set_free(set); +} + +/* Compute the effective block size as a list of the sizes in each dimension + * and store the sizes in kernel->block_dim. + * + * The block size specified by the user or set by default + * in read_block_sizes() and applied by the thread filter, + * may be too large for the given code in the sense that + * it may contain threads that don't need to execute anything. + * We therefore update this block size in kernel->block_dim + * to the smallest block size that ensures that all threads + * that actually execute code are included in the block. + * + * The set of possible values of the thread ids is obtained from + * the domain elements "domain" and kernel->thread_filter. + * The current implementation eliminates all parameters, ensuring + * that the size is a fixed constant in each dimension. + * In principle we could also compute parametric sizes. + * We would have to make sure to project out all b%d and t%d parameters, + * however. 
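+ * For example, if the thread identifiers t0 and t1 only take the values
+ * 0 <= t0 <= 7 and 0 <= t1 <= 3 in "domain", then kernel->block_dim is
+ * set to {8, 4}.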
+ */ +static isl_stat extract_block_size(struct ppcg_kernel *kernel, + __isl_take isl_union_set *domain) +{ + int i; + int nparam; + isl_set *block; + + domain = isl_union_set_intersect(domain, + isl_union_set_copy(kernel->thread_filter)); + block = isl_union_set_params(domain); + block = isl_set_from_params(block); + block = isl_set_add_dims(block, isl_dim_set, kernel->n_block); + for (i = 0; i < kernel->n_block; ++i) { + int pos; + isl_id *id; + + if (!block) + return isl_stat_error; + + id = isl_id_list_get_id(kernel->thread_ids, i); + pos = isl_set_find_dim_by_id(block, isl_dim_param, id); + isl_id_free(id); + if (pos < 0) + isl_die(isl_set_get_ctx(block), isl_error_internal, + "missing constraints on thread identifier", + block = isl_set_free(block)); + block = isl_set_equate(block, isl_dim_param, pos, + isl_dim_set, i); + } + nparam = isl_set_dim(block, isl_dim_param); + block = isl_set_project_out(block, isl_dim_param, 0, nparam); + + if (!block) + return isl_stat_error; + + extract_fixed_size(block, kernel->block_dim); + + return isl_stat_ok; +} + +struct ppcg_kernel *ppcg_kernel_free(struct ppcg_kernel *kernel) +{ + int i, j; + + if (!kernel) + return NULL; + + isl_id_list_free(kernel->block_ids); + isl_id_list_free(kernel->thread_ids); + isl_multi_pw_aff_free(kernel->grid_size); + isl_ast_expr_free(kernel->grid_size_expr); + isl_set_free(kernel->context); + isl_union_set_free(kernel->core); + isl_union_set_free(kernel->arrays); + isl_union_pw_multi_aff_free(kernel->contraction); + isl_union_set_free(kernel->expanded_domain); + isl_space_free(kernel->space); + isl_ast_node_free(kernel->tree); + isl_union_set_free(kernel->block_filter); + isl_union_set_free(kernel->thread_filter); + isl_union_pw_multi_aff_free(kernel->copy_schedule); + isl_union_set_free(kernel->sync_writes); + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *array = &kernel->array[i]; + + for (j = 0; j < array->n_group; ++j) + gpu_array_ref_group_free(array->groups[j]); + free(array->groups); + + isl_multi_pw_aff_free(array->bound); + isl_ast_expr_free(array->bound_expr); + } + free(kernel->array); + + for (i = 0; i < kernel->n_var; ++i) { + free(kernel->var[i].name); + isl_vec_free(kernel->var[i].size); + } + free(kernel->var); + + free(kernel); + + return NULL; +} + +/* Wrapper around ppcg_kernel_free for use as a isl_id_set_free_user callback. 
+ */ +static void ppcg_kernel_free_wrap(void *user) +{ + struct ppcg_kernel *kernel = user; + + ppcg_kernel_free(kernel); +} + +static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group, + struct ppcg_kernel_var *var) +{ + int j; + struct gpu_array_tile *tile; + isl_printer *p; + + var->array = group->array; + + var->type = gpu_array_ref_group_type(group); + tile = gpu_array_ref_group_tile(group); + + p = isl_printer_to_str(ctx); + p = gpu_array_ref_group_print_name(group, p); + var->name = isl_printer_get_str(p); + isl_printer_free(p); + + var->size = isl_vec_alloc(ctx, group->array->n_index); + + for (j = 0; j < group->array->n_index; ++j) + var->size = isl_vec_set_element_val(var->size, j, + isl_val_copy(tile->bound[j].size)); +} + +static int create_kernel_vars(struct ppcg_kernel *kernel) +{ + int i, j, n; + + n = 0; + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *array = &kernel->array[i]; + + for (j = 0; j < array->n_group; ++j) { + struct gpu_array_ref_group *group = array->groups[j]; + enum ppcg_group_access_type type; + + type = gpu_array_ref_group_type(group); + if (type != ppcg_access_global) + ++n; + } + } + + kernel->n_var = n; + kernel->var = isl_calloc_array(kernel->ctx, struct ppcg_kernel_var, n); + if (!kernel->var) + return -1; + + n = 0; + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *array = &kernel->array[i]; + + for (j = 0; j < array->n_group; ++j) { + struct gpu_array_ref_group *group = array->groups[j]; + enum ppcg_group_access_type type; + + type = gpu_array_ref_group_type(group); + if (type == ppcg_access_global) + continue; + create_kernel_var(kernel->ctx, group, &kernel->var[n]); + ++n; + } + } + + return 0; +} + +/* Replace "pa" by the zero function defined over the universe domain + * in the space of "pa". + */ +static __isl_give isl_pw_aff *set_universally_zero(__isl_take isl_pw_aff *pa) +{ + isl_space *space; + isl_aff *zero; + + space = isl_space_domain(isl_pw_aff_get_space(pa)); + isl_pw_aff_free(pa); + zero = isl_aff_zero_on_domain(isl_local_space_from_space(space)); + + return isl_pw_aff_from_aff(zero); +} + +/* The sizes of the arrays on the host that have been computed by + * extract_array_info may depend on the parameters. Use the extra + * constraints on the parameters that are valid at "host_domain" + * to simplify these expressions and store the results in kernel->array. + * + * We only need these localized bounds for arrays that are accessed + * by the current kernel. If we have found at least one reference group + * then the array is accessed by the kernel. + * + * The resulting sizes may be functions that are nowhere defined + * in case the access function cannot possibly access anything inside + * the kernel for some reason. If so, they are replaced by the zero + * function. Since the access function cannot actually access anything, + * there is no harm in printing the array sizes as zero. 
+ */ +static void localize_bounds(struct ppcg_kernel *kernel, + __isl_keep isl_set *host_domain) +{ + int i, j; + isl_set *context; + + context = isl_set_copy(host_domain); + context = isl_set_params(context); + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *local = &kernel->array[i]; + isl_multi_pw_aff *bound; + int n_index; + + if (local->n_group == 0) + continue; + + n_index = local->array->n_index; + bound = isl_multi_pw_aff_copy(local->array->bound); + + for (j = 0; j < n_index; ++j) { + isl_pw_aff *pwaff; + int empty; + + pwaff = isl_multi_pw_aff_get_pw_aff(bound, j); + pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context)); + empty = isl_pw_aff_is_empty(pwaff); + if (empty < 0) + pwaff = isl_pw_aff_free(pwaff); + else if (empty) + pwaff = set_universally_zero(pwaff); + bound = isl_multi_pw_aff_set_pw_aff(bound, j, pwaff); + } + + local->n_index = n_index; + local->bound = bound; + } + isl_set_free(context); +} + +/* Create the array of gpu_local_array_info structures "array" + * inside "kernel". The number of elements in this array is + * the same as the number of arrays in "prog". + * Initialize the "array" field of each local array to point + * to the corresponding array in "prog". + */ +static struct ppcg_kernel *ppcg_kernel_create_local_arrays( + struct ppcg_kernel *kernel, struct gpu_prog *prog) +{ + int i; + isl_ctx *ctx; + + ctx = isl_set_get_ctx(prog->context); + kernel->array = isl_calloc_array(ctx, + struct gpu_local_array_info, prog->n_array); + if (!kernel->array) + return ppcg_kernel_free(kernel); + kernel->n_array = prog->n_array; + + for (i = 0; i < prog->n_array; ++i) + kernel->array[i].array = &prog->array[i]; + + return kernel; +} + +/* Does "kernel" need to be passed an argument corresponding to array "i"? + * + * The argument is only needed if the kernel accesses this device memory. + */ +int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i) +{ + return kernel->array[i].global; +} + +/* Find the element in gen->stmt that has the given "id". + * Return NULL if no such gpu_stmt can be found. + */ +static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id) +{ + int i; + + for (i = 0; i < prog->n_stmts; ++i) { + if (id == prog->stmts[i].id) + break; + } + + return i < prog->n_stmts ? &prog->stmts[i] : NULL; +} + +void ppcg_kernel_stmt_free(void *user) +{ + struct ppcg_kernel_stmt *stmt = user; + + if (!stmt) + return; + + switch (stmt->type) { + case ppcg_kernel_copy: + isl_ast_expr_free(stmt->u.c.index); + isl_ast_expr_free(stmt->u.c.local_index); + break; + case ppcg_kernel_domain: + isl_id_to_ast_expr_free(stmt->u.d.ref2expr); + break; + case ppcg_kernel_sync: + break; + } + + free(stmt); +} + +/* Return the gpu_stmt_access in the list "accesses" that corresponds + * to "ref_id". + */ +static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses, + __isl_keep isl_id *ref_id) +{ + struct gpu_stmt_access *access; + + for (access = accesses; access; access = access->next) + if (access->ref_id == ref_id) + return access; + + return NULL; +} + +/* Return the index of the array called "name" in the list of arrays. + */ +static int find_array_index(struct ppcg_kernel *kernel, const char *name) +{ + int i; + + for (i = 0; i < kernel->n_array; ++i) + if (!strcmp(name, kernel->array[i].array->name)) + return i; + + return -1; +} + +/* Internal data structure for the index and AST expression transformation + * callbacks for pet_stmt_build_ast_exprs. 
+ * + * "kernel" is the kernel for which are computing AST expressions and + * may be NULL if we are not inside a kernel. + * "accesses" is the list of gpu_stmt_access in the statement. + * "iterator_map" expresses the statement iterators in terms of + * the AST loop iterators. + * "sched2copy" expresses the outer copy_schedule_dim dimensions of + * the kernel schedule in terms of the AST loop iterators and + * may be NULL if we are not inside a kernel. + * + * The following fields are set in transform_index and used in transform_expr. + * "array" is the array that is being accessed. + * "global" is set if the global array is accessed (rather than + * shared/private memory). + * "local_array" refers to information on the array specialized + * to the current kernel. + */ +struct ppcg_transform_data { + struct ppcg_options *options; + struct ppcg_kernel *kernel; + struct gpu_stmt_access *accesses; + isl_pw_multi_aff *iterator_map; + isl_pw_multi_aff *sched2copy; + + struct gpu_array_info *array; + int global; + struct gpu_local_array_info *local_array; +}; + +/* Return a pointer to the gpu_array_ref_group in "local" + * that contains the reference "access". + * Return NULL if no such group can be found. + */ +static struct gpu_array_ref_group *find_ref_group( + struct gpu_local_array_info *local, struct gpu_stmt_access *access) +{ + int i, j; + + for (i = 0; i < local->n_group; ++i) { + struct gpu_array_ref_group *group = local->groups[i]; + + for (j = 0; j < group->n_ref; ++j) + if (group->refs[j] == access) + return group; + } + + return NULL; +} + +/* Given an index expression "index" of the form + * + * L -> F(A), + * + * with F(A) either A or some subfield of A and L the AST loop iterators, + * and a tiling "tiling" of the form + * + * [L -> A] -> T + * + * apply the tiling to the outer array in the index expression to obtain + * + * L -> T(A) + * + * If F(A) is some subfield of A, then separate the member access + * into the base index expression and the field index expression, + * apply the tiling to the base index expression and combine the result + * with the field index expression. + * + * If F(A) is A, then modify index to keep track of the iterators + * + * L -> [L -> A] + * + * and combine the result with the tiling to obtain a tiled index expression + * in terms of the AST loop iterators + * + * L -> T + */ +static __isl_give isl_multi_pw_aff *tile_outer( + __isl_take isl_multi_pw_aff *index, __isl_take isl_multi_pw_aff *tiling) +{ + isl_bool is_wrapping; + isl_space *space; + isl_multi_pw_aff *mpa; + + is_wrapping = isl_multi_pw_aff_range_is_wrapping(index); + if (is_wrapping < 0) + goto error; + if (is_wrapping) { + isl_multi_pw_aff *field; + + field = isl_multi_pw_aff_copy(index); + field = isl_multi_pw_aff_range_factor_range(field); + index = isl_multi_pw_aff_range_factor_domain(index); + index = tile_outer(index, tiling); + return isl_multi_pw_aff_range_product(index, field); + } + + space = isl_space_domain(isl_multi_pw_aff_get_space(index)); + space = isl_space_map_from_set(space); + mpa = isl_multi_pw_aff_identity(space); + index = isl_multi_pw_aff_range_product(mpa, index); + index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index); + + return index; +error: + isl_multi_pw_aff_free(index); + isl_multi_pw_aff_free(tiling); + return NULL; +} + +/* Index transformation callback for pet_stmt_build_ast_exprs. + * + * "index" expresses the array indices in terms of statement iterators + * + * We first reformulate "index" in terms of the AST loop iterators. 
+ * Then we check if we are accessing the global array or + * a shared/private copy. In particular, if we are not inside a kernel + * then we must be accessing a global array. + * In the former case, we simply return + * the updated index. If "index" is an affine expression rather + * than an array access, then we also return the updated index here. + * + * If no reference groups have been computed for the array, + * then we can only be accessing the global array. + * + * Otherwise, we apply the tiling to the index. + * This tiling is of the form + * + * [D -> A] -> T + * + * where D corresponds to the outer tile->depth dimensions of + * the kernel schedule. + * The index is of the form + * + * L -> A + * + * We update the tiling to refer to the AST loop iterators + * + * [L -> A] -> T + * + * and combine it with the index to obtain a tiled index expression in terms + * of the AST loop iterators + * + * L -> T + * + * Note that while the tiling applies directly to an outer array. + * the index may refer to some subfield of this outer array. + * In such cases, the result will refer to the same subfield of the tile. + * That is, an index expression of the form L -> F(A) will be transformed + * into an index expression of the form L -> F(T). + */ +static __isl_give isl_multi_pw_aff *transform_index( + __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id, + void *user) +{ + struct ppcg_transform_data *data = user; + struct gpu_stmt_access *access; + struct gpu_array_ref_group *group; + struct gpu_array_tile *tile; + isl_pw_multi_aff *iterator_map; + int i; + int dim; + const char *name; + isl_space *space; + isl_multi_pw_aff *tiling; + isl_pw_multi_aff *pma; + isl_pw_multi_aff *sched2depth; + + data->array = NULL; + + iterator_map = isl_pw_multi_aff_copy(data->iterator_map); + index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map); + + if (!data->kernel) + return index; + + access = find_access(data->accesses, ref_id); + if (!access) + return index; + if (!isl_map_has_tuple_name(access->access, isl_dim_out)) + return index; + + name = get_outer_array_name(access->access); + i = find_array_index(data->kernel, name); + if (i < 0) + isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal, + "cannot find array", + return isl_multi_pw_aff_free(index)); + data->local_array = &data->kernel->array[i]; + data->array = data->local_array->array; + + group = find_ref_group(data->local_array, access); + if (!group) { + data->global = 1; + return index; + } + + tile = gpu_array_ref_group_tile(group); + data->global = !tile; + if (!tile) + return index; + + space = isl_space_domain(isl_multi_aff_get_space(tile->tiling)); + space = isl_space_range(isl_space_unwrap(space)); + space = isl_space_map_from_set(space); + pma = isl_pw_multi_aff_identity(space); + sched2depth = isl_pw_multi_aff_copy(data->sched2copy); + dim = isl_pw_multi_aff_dim(sched2depth, isl_dim_out); + sched2depth = isl_pw_multi_aff_drop_dims(sched2depth, isl_dim_out, + tile->depth, dim - tile->depth); + pma = isl_pw_multi_aff_product(sched2depth, pma); + tiling = isl_multi_pw_aff_from_multi_aff( + isl_multi_aff_copy(tile->tiling)); + tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma); + + index = tile_outer(index, tiling); + + return index; +} + +/* Dereference "expr" by adding an index [0]. + * The original "expr" is assumed not to have any indices. + * + * If "expr" is a member access, then the dereferencing needs + * to be applied to the structure argument of this member access. 
+ */ +static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr) +{ + isl_ctx *ctx; + isl_ast_expr *arg0, *res; + isl_ast_expr_list *list; + + arg0 = isl_ast_expr_get_op_arg(expr, 0); + if (!arg0) + return isl_ast_expr_free(expr); + if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op && + isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) { + isl_ast_expr *arg; + + arg = isl_ast_expr_get_op_arg(arg0, 0); + arg = dereference(arg); + arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg); + expr = isl_ast_expr_set_op_arg(expr, 0, arg0); + + return expr; + } + isl_ast_expr_free(arg0); + + ctx = isl_ast_expr_get_ctx(expr); + res = isl_ast_expr_from_val(isl_val_zero(ctx)); + list = isl_ast_expr_list_from_ast_expr(res); + res = isl_ast_expr_get_op_arg(expr, 0); + res = isl_ast_expr_access(res, list); + isl_ast_expr_free(expr); + + return res; +} + +/* Linearize the index expression "expr" based on the array bounds + * of "array". + * + * That is, transform expression + * + * A[i_0][i_1]...[i_n] + * + * to + * + * A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n] + * + * where b_0, b_1, ..., b_n are the bounds on the array. + * + * If the base of "expr" is a member access, then the linearization needs + * to be applied to the structure argument of this member access. + * + * In the base case, if "expr" has no arguments (other than the name of + * the array), then we are passing an entire array to a function. + * In this case, there is nothing to linearize. + * Note that at this point an expression with no arguments can + * only be an entire array because the scalar case and + * the case of single struct are handled by the caller. + * + * If the number of specified index expressions in "expr" + * is smaller than the dimension of the accessed array, + * then the missing i_j also do not appear in the linearized expression. + * Furthermore, since such an expression does not refer to a single + * element while the default linearized expression would refer to + * a single element, we return the expression + * + * A + (..((i_0 * b_1 + i_1) ... ) * b_l + i_l) + * + * instead. Note that because of the special case handling above, + * we can assume here that there is at least one index expression. 
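+ *
+ * As a purely illustrative example (the sizes are made up), for an
+ * array with bounds b_0 = n, b_1 = 64 and b_2 = 8, the access
+ *
+ *	A[i][j][k]
+ *
+ * is linearized to
+ *
+ *	A[(i * 64 + j) * 8 + k]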
+ */ +__isl_give isl_ast_expr *gpu_local_array_info_linearize_index( + struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr) +{ + int i, n; + isl_ast_expr *arg0; + isl_ast_expr *res; + isl_ast_expr_list *list; + + arg0 = isl_ast_expr_get_op_arg(expr, 0); + if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op && + isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) { + isl_ast_expr *arg; + + arg = isl_ast_expr_get_op_arg(arg0, 0); + arg = gpu_local_array_info_linearize_index(array, arg); + arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg); + expr = isl_ast_expr_set_op_arg(expr, 0, arg0); + + return expr; + } + isl_ast_expr_free(arg0); + + if (isl_ast_expr_get_op_n_arg(expr) == 1) + return expr; + + n = isl_ast_expr_get_op_n_arg(expr); + res = isl_ast_expr_get_op_arg(expr, 1); + for (i = 1; i < array->n_index; ++i) { + isl_ast_expr *expr_i; + + expr_i = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i); + res = isl_ast_expr_mul(res, expr_i); + + if (i + 1 >= n) + continue; + expr_i = isl_ast_expr_get_op_arg(expr, i + 1); + res = isl_ast_expr_add(res, expr_i); + } + + if (1 + array->n_index > n) { + res = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), res); + } else { + list = isl_ast_expr_list_from_ast_expr(res); + res = isl_ast_expr_get_op_arg(expr, 0); + res = isl_ast_expr_access(res, list); + } + + isl_ast_expr_free(expr); + + return res; +} + +/* AST expression transformation callback for pet_stmt_build_ast_exprs. + * + * If the AST expression refers to an array that is not accessed + * at all, then this means the value of the expression is not used, + * so we might as well print zero (NULL pointer) instead. + * + * If the AST expression refers to a global scalar that is not + * a read-only scalar, then its address was passed to the kernel and + * we need to dereference it. + * + * If the AST expression refers to an access to a global array, + * then we linearize the access exploiting the bounds in data->local_array. + */ +static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr, + __isl_keep isl_id *id, void *user) +{ + struct ppcg_transform_data *data = user; + + if (!data->array) + return expr; + if (!data->array->accessed) { + isl_ctx *ctx; + + ctx = isl_ast_expr_get_ctx(expr); + isl_ast_expr_free(expr); + return isl_ast_expr_from_val(isl_val_zero(ctx)); + } + if (gpu_array_is_read_only_scalar(data->array)) + return expr; + if (!data->global) + return expr; + if (data->array->n_index == 0) + return dereference(expr); + if (!data->array->linearize) + return expr; + + return gpu_local_array_info_linearize_index(data->local_array, expr); +} + +/* This function is called for each instance of a user statement + * in the kernel "kernel", identified by "gpu_stmt". + * "kernel" may be NULL if we are not inside a kernel. + * + * We attach a struct ppcg_kernel_stmt to the "node", containing + * a computed AST expression for each access, through an annotation + * with name "user". + * These AST expressions are computed from iterator_map, + * which expresses the domain + * elements in terms of the generated loops, and sched2copy, + * which expresses the outer copy_schedule_dim dimensions of + * the kernel schedule computed by PPCG in terms of the generated loops. 
+ */ +static __isl_give isl_ast_node *create_domain_leaf( + struct ppcg_kernel *kernel, __isl_take isl_ast_node *node, + __isl_keep isl_ast_build *build, struct gpu_stmt *gpu_stmt, + struct gpu_gen *gen) +{ + struct ppcg_transform_data data; + struct ppcg_kernel_stmt *stmt; + isl_ctx *ctx; + isl_id *id; + isl_pw_multi_aff *sched2copy; + isl_map *map; + isl_pw_multi_aff *iterator_map; + isl_union_map *schedule; + + if (!node) + return NULL; + ctx = isl_ast_node_get_ctx(node); + + stmt = isl_calloc_type(ctx, struct ppcg_kernel_stmt); + if (!stmt) + return isl_ast_node_free(node); + + schedule = isl_ast_build_get_schedule(build); + map = isl_map_reverse(isl_map_from_union_map(schedule)); + iterator_map = isl_pw_multi_aff_from_map(map); + if (kernel) + sched2copy = compute_sched_to_copy(kernel, + isl_pw_multi_aff_copy(iterator_map)); + else + sched2copy = NULL; + + stmt->type = ppcg_kernel_domain; + stmt->u.d.stmt = gpu_stmt; + + data.kernel = kernel; + data.accesses = stmt->u.d.stmt->accesses; + data.iterator_map = iterator_map; + data.sched2copy = sched2copy; + stmt->u.d.ref2expr = gen->build_ast_expr(stmt->u.d.stmt->stmt, + build, &transform_index, &data, + &transform_expr, &data); + + isl_pw_multi_aff_free(iterator_map); + isl_pw_multi_aff_free(sched2copy); + + id = isl_id_alloc(ctx, "user", stmt); + id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); + return isl_ast_node_set_annotation(node, id); +} + +/* This function is called for each statement node in the AST + * for copying to or from shared/private memory. + * Attach a pointer to a ppcg_kernel_stmt representing the copy + * statement to the node. + * The statement name is "read" or "write", depending on whether we are + * reading from global memory or writing to global memory. + * + * The schedule is of the form + * + * type[D -> A] -> L + * + * where D corresponds to the outer tile->depth dimensions of + * the kernel schedule, A to the global array and L to the outer + * generated AST schedule. + * We compute the inverse and strip off the type, resulting in + * + * L -> [D -> A] + * + * We combine this mapping with on the one hand the projection + * + * [D -> A] -> A + * + * and on the other hand the group tiling + * + * [D -> A] -> T + * + * resulting in + * + * L -> A and L -> T + * + * and store the corresponding expressions in stmt->index and stmt->local_index, + * where stmt points to the ppcg_kernel_stmt that is attached to the node. + * stmt->index is linearized if the global memory array is linearized. 
+ */ +static __isl_give isl_ast_node *create_access_leaf(struct ppcg_kernel *kernel, + struct gpu_array_ref_group *group, __isl_take isl_ast_node *node, + __isl_keep isl_ast_build *build) +{ + struct ppcg_kernel_stmt *stmt; + struct gpu_array_tile *tile; + isl_id *id; + isl_ast_expr *expr; + isl_space *space; + isl_map *access; + isl_pw_multi_aff *pma, *pma2; + const char *type; + + stmt = isl_calloc_type(kernel->ctx, struct ppcg_kernel_stmt); + if (!stmt) + return isl_ast_node_free(node); + + access = isl_map_from_union_map(isl_ast_build_get_schedule(build)); + type = isl_map_get_tuple_name(access, isl_dim_in); + stmt->u.c.read = !strcmp(type, "read"); + access = isl_map_reverse(access); + pma = isl_pw_multi_aff_from_map(access); + pma = isl_pw_multi_aff_reset_tuple_id(pma, isl_dim_out); + + space = isl_space_range(isl_pw_multi_aff_get_space(pma)); + space = isl_space_unwrap(space); + pma2 = isl_pw_multi_aff_range_map(space); + pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2, + isl_pw_multi_aff_copy(pma)); + expr = isl_ast_build_access_from_pw_multi_aff(build, pma2); + if (group->array->linearize) + expr = gpu_local_array_info_linearize_index(group->local_array, + expr); + stmt->u.c.index = expr; + + tile = gpu_array_ref_group_tile(group); + pma2 = isl_pw_multi_aff_from_multi_aff( + isl_multi_aff_copy(tile->tiling)); + pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2, pma); + expr = isl_ast_build_access_from_pw_multi_aff(build, pma2); + stmt->u.c.local_index = expr; + + stmt->u.c.array = group->array; + stmt->u.c.local_array = group->local_array; + stmt->type = ppcg_kernel_copy; + + id = isl_id_alloc(kernel->ctx, "copy", stmt); + id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); + return isl_ast_node_set_annotation(node, id); +} + +/* Create a synchronization ppcg_kernel_stmt and + * attach it to the node "node" representing the synchronization. + */ +static __isl_give isl_ast_node *create_sync_leaf( + struct ppcg_kernel *kernel, __isl_take isl_ast_node *node, + __isl_keep isl_ast_build *build) +{ + struct ppcg_kernel_stmt *stmt; + isl_id *id; + + stmt = isl_calloc_type(kernel->ctx, struct ppcg_kernel_stmt); + if (!stmt) + return isl_ast_node_free(node); + + stmt->type = ppcg_kernel_sync; + id = isl_id_alloc(kernel->ctx, "sync", stmt); + id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); + return isl_ast_node_set_annotation(node, id); +} + +/* Build AST expressions for the device array sizes of all arrays in "prog" + * that require allocation on the device using "build", as well as + * for the original array sizes of all arrays that need to be declared + * on the host. + * "node" is freed in case of error. 
+ */ +static __isl_give isl_ast_node *build_array_bounds( + __isl_take isl_ast_node *node, struct gpu_prog *prog, + __isl_keep isl_ast_build *build) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + struct gpu_array_info *array = &prog->array[i]; + isl_multi_pw_aff *size; + isl_ast_expr *expr; + + if (!gpu_array_requires_device_allocation(array)) + continue; + + size = isl_multi_pw_aff_copy(array->bound); + expr = ppcg_build_size_expr(size, build); + array->bound_expr = expr; + if (!expr) + return isl_ast_node_free(node); + } + + for (i = 0; i < prog->n_array; ++i) { + struct gpu_array_info *array = &prog->array[i]; + isl_set *extent; + isl_multi_pw_aff *size; + isl_ast_expr *expr; + + if (!array->declare_local) + continue; + extent = isl_set_copy(array->declared_extent); + size = ppcg_size_from_extent(extent); + expr = ppcg_build_size_expr(size, build); + array->declared_size = expr; + if (!expr) + return isl_ast_node_free(node); + } + + return node; +} + +/* Internal data structure for at_domain. + * + * "prog" represents the entire scop. + * "kernel" points to the kernel to which the current schedule node + * belongs. It is set by before_mark and reset by after_mark. + * It may be NULL if we are outside any kernel. + */ +struct ppcg_at_domain_data { + struct gpu_prog *prog; + struct gpu_gen *gen; + struct ppcg_kernel *kernel; +}; + +/* This function is called for each instance of a user statement + * in the kernel. This may be one of the original user statements + * or a statement introduced by PPCG. + * + * We first check if the statement id corresponds to a gpu statement, + * which indicates the statement is an original user statement. Any statement + * that is not an original user statement has been introduced by PPCG and + * requires special handling. + * + * If the user statement is one of the original user statements, then we call + * create_domain_leaf. If it is "init_device", then we call + * build_array_bounds. Otherwise, we check if it is a copy or synchronization + * statement and call the appropriate functions. Statements that copy an array + * to/from the device do not need any further treatment. + * Neither does "clear_device". 
+ */ +static __isl_give isl_ast_node *at_domain(__isl_take isl_ast_node *node, + __isl_keep isl_ast_build *build, void *user) +{ + struct ppcg_at_domain_data *data = user; + struct gpu_stmt *gpu_stmt; + isl_ast_expr *expr, *arg; + isl_id *id; + int is_sync; + const char *name; + void *p; + + expr = isl_ast_node_user_get_expr(node); + arg = isl_ast_expr_get_op_arg(expr, 0); + id = isl_ast_expr_get_id(arg); + name = isl_id_get_name(id); + p = isl_id_get_user(id); + isl_ast_expr_free(expr); + isl_ast_expr_free(arg); + + gpu_stmt = find_stmt(data->prog, id); + is_sync = gpu_tree_id_is_sync(id, data->kernel); + isl_id_free(id); + + if (gpu_stmt) + return create_domain_leaf(data->kernel, node, build, gpu_stmt, + data->gen); + + if (!prefixcmp(name, "to_device_") || !prefixcmp(name, "from_device_")) + return node; + if (!strcmp(name, "init_device")) + return build_array_bounds(node, data->prog, build); + if (!strcmp(name, "clear_device")) + return node; + if (is_sync < 0) + return isl_ast_node_free(node); + if (!strcmp(name, "read") || !strcmp(name, "write")) { + struct gpu_array_ref_group *group = p; + return create_access_leaf(data->kernel, group, node, build); + } + if (!is_sync) + isl_die(data->prog->ctx, isl_error_internal, + "unknown statement type", + return isl_ast_node_free(node)); + return create_sync_leaf(data->kernel, node, build); +} + +/* Given a set of wrapped references "ref", return the corresponding + * access relations based on the tagged access relations "tagged". + * + * The elements of "ref" are of the form + * + * [D -> R] + * + * with D an iteration domains and R a reference. + * The elements of "tagged" are of the form + * + * [D -> R] -> A + * + * with A an array. + * + * Extend "tagged" to include the iteration domain in the range, i.e., + * + * [D -> R] -> [D -> A] + * + * apply the result to "ref" and then unwrap the resulting set + * to obtain relations of the form + * + * D -> A + */ +static __isl_give isl_union_map *wrapped_reference_to_access( + __isl_take isl_union_set *ref, __isl_take isl_union_map *tagged) +{ + isl_union_map *tag2access; + + tag2access = isl_union_map_copy(tagged); + tag2access = isl_union_map_universe(tag2access); + tag2access = isl_union_set_unwrap(isl_union_map_domain(tag2access)); + tag2access = isl_union_map_domain_map(tag2access); + tag2access = isl_union_map_range_product(tag2access, tagged); + + ref = isl_union_set_coalesce(ref); + ref = isl_union_set_apply(ref, tag2access); + + return isl_union_set_unwrap(ref); +} + +/* Given an access relation "access" from one or more array reference groups, + * remove those reads if ("read" is 1) or writes (if "read" is 0) + * that are only needed to communicate data within + * the same iteration of "sched". + * The domain of "sched" corresponds to the original statement instances, + * i.e., those that appear in the domains of the access relations. + * "tagged" contains all tagged access relations to all + * the array reference groups accessed by "access" from statement + * instances scheduled by "sched". + * + * If the access is a read then it is either an element of + * + * live_in union (range flow) + * + * where live_in and flow may be overapproximations, or + * it reads an uninitialized value (that is not live-in because + * there is an intermediate kill) or it reads a value that was + * written within the same (compound) statement instance. 
+ * If the access is a write then it is either an element of + * + * live_out union (domain flow) + * + * or it writes a value that is never read (and is not live-out + * because of an intermediate kill) or only + * within the same (compound) statement instance. + * In both cases, the access relation is also a subset of + * the group access relation. + * + * The cases where an uninitialized value is read or a value is written + * that is never read or where the dataflow occurs within a statement + * instance are also considered local and may also be removed. + * + * Essentially, we compute the intersection of "access" with either + * + * live_in union (range non-local-flow) + * + * or + * + * live_out union (domain non-local-flow) + * + * We first construct a relation "local" + * + * [[D -> R] -> [D' -> R']] + * + * of pairs of domain iterations accessing the reference group + * and references in the group that are coscheduled by "sched". + * + * If this relation does not intersect the dataflow dependences, + * then there is nothing we can possibly remove, unless the dataflow + * dependences themselves only relate a subset of the accesses. + * In particular, the accesses may not be involved in any dataflow + * dependences, either because they are uninitialized reads/dead writes + * or because the dataflow occurs inside a statement instance. + * + * Since the computation below may break up the access relation + * into smaller pieces, we only perform the intersection with + * the non-local dependent accesses if the local pairs + * intersect the dataflow dependences. Otherwise, we intersect + * with the universe of the non-local dependent accesses. + * This should at least remove accesses from statements that + * do not participate in any dependences. + * + * In particular, we remove the "local" dataflow dependences from + * the set of all dataflow dependences, or at least those + * that may contribute to a domain/range that intersects + * the domain of "access". + * Note that if the potential dataflow dependences are an overapproximation + * of the actual dataflow dependences, then the result remains an + * overapproximation of the non-local dataflow dependences. + * Copying to/from global memory is only needed for the references + * in the domain/range of the result or for accesses that are live out/in + * for the entire scop. + * + * We therefore map the domain/range of the "external" relation + * to the corresponding access relation and take the union with + * the live out/in relation. 
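+ *
+ * As a simple illustration, a read of an array element that was
+ * written earlier within the same (compound) statement instance,
+ * with the value not being needed outside that instance, is purely
+ * local and therefore does not require the data to be copied in
+ * from global memory.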
+ */ +static __isl_give isl_union_map *remove_local_accesses( + struct gpu_prog *prog, __isl_take isl_union_map *tagged, + __isl_take isl_union_map *access, __isl_take isl_union_map *sched, + int read) +{ + int empty; + isl_union_pw_multi_aff *tagger; + isl_union_set *domain, *access_domain; + isl_union_map *local, *external, *universe; + isl_union_set *tag_set; + + if (isl_union_map_is_empty(access)) { + isl_union_map_free(sched); + isl_union_map_free(tagged); + return access; + } + + tagger = isl_union_pw_multi_aff_copy(prog->scop->tagger); + domain = isl_union_map_domain(isl_union_map_copy(tagged)); + tagger = isl_union_pw_multi_aff_intersect_domain(tagger, + isl_union_set_copy(domain)); + sched = isl_union_map_preimage_domain_union_pw_multi_aff(sched, tagger); + + local = isl_union_map_apply_range(sched, + isl_union_map_reverse(isl_union_map_copy(sched))); + local = isl_union_map_intersect(local, + isl_union_map_copy(prog->scop->tagged_dep_flow)); + + empty = isl_union_map_is_empty(local); + + external = isl_union_map_copy(prog->scop->tagged_dep_flow); + universe = isl_union_map_universe(isl_union_map_copy(access)); + access_domain = isl_union_map_domain(universe); + domain = isl_union_set_universe(domain); + universe = isl_union_set_unwrap(domain); + universe = isl_union_map_intersect_domain(universe, access_domain); + domain = isl_union_map_wrap(universe); + if (read) + external = isl_union_map_intersect_range(external, domain); + else + external = isl_union_map_intersect_domain(external, domain); + external = isl_union_map_intersect_params(external, + isl_set_copy(prog->scop->context)); + external = isl_union_map_subtract(external, local); + + if (read) { + tag_set = isl_union_map_range(external); + external = wrapped_reference_to_access(tag_set, tagged); + external = isl_union_map_union(external, + isl_union_map_copy(prog->scop->live_in)); + } else { + tag_set = isl_union_map_domain(external); + external = wrapped_reference_to_access(tag_set, tagged); + external = isl_union_map_union(external, + isl_union_map_copy(prog->scop->live_out)); + } + + if (empty < 0) + external = isl_union_map_free(external); + else if (empty) + external = isl_union_map_universe(external); + + access = isl_union_map_intersect(access, external); + + return access; +} + +/* Given an access relation "access" from "group", remove those reads + * if ("read" is 1) or writes (if "read" is 0) that are only needed to + * communicate data within the same iteration of the schedule "prefix" + * at the position where the copying of the group is inserted. + * That is, the output dimension of "prefix" + * is equal to tile->depth. + * The domain of "prefix" corresponds to the original statement instances, + * i.e., those that appear in the domains of the access relations. + * + * Extract the tagged access relation of "group" and + * then call remove_local_accesses. + */ +static __isl_give isl_union_map *remove_local_accesses_group( + struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, + __isl_take isl_union_map *access, __isl_keep isl_union_map *prefix, + int read) +{ + isl_union_map *sched, *tagged; + + if (isl_union_map_is_empty(access)) + return access; + + tagged = group_tagged_access_relation(group); + sched = isl_union_map_copy(prefix); + + return remove_local_accesses(kernel->prog, tagged, access, sched, read); +} + +/* Build an access AST expression for the effective grid size using "build". + * Store the result in kernel->grid_size_expr. 
+ */ +static isl_stat build_grid_size(struct ppcg_kernel *kernel, + __isl_keep isl_ast_build *build) +{ + isl_multi_pw_aff *size; + + size = isl_multi_pw_aff_copy(kernel->grid_size); + size = isl_multi_pw_aff_set_tuple_name(size, isl_dim_out, "grid"); + kernel->grid_size_expr = ppcg_build_size_expr(size, build); + + if (!kernel->grid_size_expr) + return isl_stat_error; + return isl_stat_ok; +} + +/* Build access AST expressions for the localized array sizes using "build". + * Store the result in local->bound_expr. + * Only do this for arrays for which localized bounds have been computed. + */ +static isl_stat build_local_array_sizes(struct ppcg_kernel *kernel, + __isl_keep isl_ast_build *build) +{ + int i; + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *local = &kernel->array[i]; + isl_multi_pw_aff *size; + + if (local->n_group == 0) + continue; + size = isl_multi_pw_aff_copy(local->bound); + local->bound_expr = ppcg_build_size_expr(size, build); + if (!local->bound_expr) + return isl_stat_error; + } + + return isl_stat_ok; +} + +/* Build access AST expressions for the effective grid size and + * the localized array sizes using "build". + */ +static isl_stat build_grid_and_local_array_sizes(struct ppcg_kernel *kernel, + __isl_keep isl_ast_build *build) +{ + if (build_grid_size(kernel, build) < 0) + return isl_stat_error; + if (build_local_array_sizes(kernel, build) < 0) + return isl_stat_error; + return isl_stat_ok; +} + +/* This function is called before the AST generator starts traversing + * the schedule subtree of a node with mark "mark". + * + * If the mark is called "kernel", store the kernel pointer in data->kernel + * for use in at_domain and build AST expressions for the grid size and + * the localized array sizes. + */ +static isl_stat before_mark(__isl_keep isl_id *mark, + __isl_keep isl_ast_build *build, void *user) +{ + struct ppcg_at_domain_data *data = user; + + if (!mark) + return isl_stat_error; + if (!strcmp(isl_id_get_name(mark), "kernel")) { + data->kernel = isl_id_get_user(mark); + if (build_grid_and_local_array_sizes(data->kernel, build) < 0) + return isl_stat_error; + } + return isl_stat_ok; +} + +/* This function is called after the AST generator has finished traversing + * the schedule subtree of a mark node. "node" points to the corresponding + * mark AST node. + * + * If the mark is called "kernel", then replace "node" by a user node + * that "calls" the kernel, representing the launch of the kernel. + * The original "node" is stored inside the kernel object so that + * it can be used to print the device code. + * Note that this assumes that a kernel is only launched once. + * Also clear data->kernel. 
+ */ +static __isl_give isl_ast_node *after_mark(__isl_take isl_ast_node *node, + __isl_keep isl_ast_build *build, void *user) +{ + isl_ctx *ctx; + isl_id *id; + isl_ast_expr *expr; + isl_ast_expr_list *list; + struct ppcg_kernel *kernel; + struct ppcg_at_domain_data *data = user; + + ctx = isl_ast_node_get_ctx(node); + id = isl_ast_node_mark_get_id(node); + if (!id) + return isl_ast_node_free(node); + if (strcmp(isl_id_get_name(id), "kernel") || !data->kernel) { + isl_id_free(id); + return node; + } + kernel = data->kernel; + data->kernel = NULL; + kernel->space = isl_ast_build_get_schedule_space(build); + kernel->tree = isl_ast_node_mark_get_node(node); + isl_ast_node_free(node); + + expr = isl_ast_expr_from_id(isl_id_copy(id)); + list = isl_ast_expr_list_alloc(ctx, 0); + expr = isl_ast_expr_call(expr, list); + node = isl_ast_node_alloc_user(expr); + node = isl_ast_node_set_annotation(node, id); + + return node; +} + +static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user) +{ + int *depth = user; + int node_depth; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf) + return isl_bool_true; + node_depth = isl_schedule_node_get_schedule_depth(node); + if (node_depth > *depth) + *depth = node_depth; + + return isl_bool_false; +} + +/* Use isl to generate code for both the host and the device + * from "schedule". + * The device code is marked by "kernel" mark nodes in the schedule tree, + * containing a pointer to a ppcg_kernel object. + * The returned AST only contains the AST for the host code. + * The ASTs for the device code are embedded in ppcg_kernel objects + * attached to the leaf nodes that call "kernel". + */ +__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, + __isl_take isl_schedule *schedule) +{ + struct ppcg_at_domain_data data; + isl_ast_build *build; + isl_ast_node *tree; + isl_id_list *iterators; + int depth; + + data.prog = gen->prog; + data.gen = gen; + data.kernel = NULL; + + depth = 0; + if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth, + &depth) < 0) + return NULL; + build = isl_ast_build_alloc(gen->prog->ctx); + iterators = ppcg_scop_generate_names(gen->prog->scop, depth, "c"); + build = isl_ast_build_set_iterators(build, iterators); + build = isl_ast_build_set_at_each_domain(build, &at_domain, &data); + build = isl_ast_build_set_before_each_mark(build, &before_mark, &data); + build = isl_ast_build_set_after_each_mark(build, &after_mark, &data); + if (gen->prog->scop->options->debug->dump_final_schedule) + isl_schedule_dump(schedule); + tree = isl_ast_build_node_from_schedule(build, schedule); + isl_ast_build_free(build); + + return tree; +} + +__isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str) +{ + if (!str) + return NULL; + return isl_union_map_read_from_str(ctx, str); +} + +/* Can "node" be tiled and then mapped to block and thread identifiers? + * That is, is it permutable with at least one coincident dimension? 
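+ *
+ * For instance (purely as an illustration), the band formed by the two
+ * parallel outer loops of a typical matrix multiplication kernel is
+ * permutable and has coincident members, so it can be tiled and mapped
+ * to blocks and threads.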
+ */
+static int is_permutable(__isl_keep isl_schedule_node *node)
+{
+	if (!node)
+		return -1;
+
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
+		return 0;
+	if (!isl_schedule_node_band_get_permutable(node))
+		return 0;
+	if (isl_schedule_node_band_n_member(node) < 1)
+		return 0;
+	if (!isl_schedule_node_band_member_get_coincident(node, 0))
+		return 0;
+
+	return 1;
+}
+
+/* An isl_schedule_foreach_schedule_node_top_down callback
+ * for setting *any_permutable and aborting the search
+ * if "node" is a permutable band with coincident dimensions.
+ * Otherwise, continue searching.
+ */
+static isl_bool set_permutable(__isl_keep isl_schedule_node *node, void *user)
+{
+	int *any_permutable = user;
+	int permutable;
+
+	permutable = is_permutable(node);
+	if (permutable < 0)
+		return isl_bool_error;
+	if (!permutable)
+		return isl_bool_true;
+
+	*any_permutable = 1;
+
+	return isl_bool_error;
+}
+
+/* Does the subtree rooted at "node" have any suitably permutable band nodes?
+ * That is, does it have any nodes that are permutable and that
+ * have at least one coincident dimension?
+ */
+static int subtree_has_permutable_bands(__isl_keep isl_schedule_node *node)
+{
+	int any_parallelism = 0;
+
+	if (isl_schedule_node_foreach_descendant_top_down(node, &set_permutable,
+						&any_parallelism) < 0 &&
+	    !any_parallelism)
+		return -1;
+
+	return any_parallelism;
+}
+
+/* Does "schedule" contain any permutable band with at least one coincident
+ * member?
+ */
+int has_any_permutable_node(__isl_keep isl_schedule *schedule)
+{
+	isl_schedule_node *root;
+	int any_permutable;
+
+	root = isl_schedule_get_root(schedule);
+	any_permutable = subtree_has_permutable_bands(root);
+	isl_schedule_node_free(root);
+
+	return any_permutable;
+}
+
+/* Is "node" a candidate for mapping to block and thread identifiers?
+ * In particular, is it permutable with at least one coincident dimension?
+ * Alternatively, does the subtree rooted at "node" not contain
+ * any such permutable node? Filter nodes are skipped in this case,
+ * because a band node will be inserted in front of the returned
+ * node and this is not possible for filter nodes that are children
+ * of set or sequence nodes.
+ */
+static int is_candidate(__isl_keep isl_schedule_node *node)
+{
+	int permutable;
+
+	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
+		return 1;
+	permutable = is_permutable(node);
+	if (permutable < 0 || permutable)
+		return permutable;
+	if (isl_schedule_node_get_type(node) == isl_schedule_node_filter)
+		return 0;
+	permutable = subtree_has_permutable_bands(node);
+	if (permutable < 0)
+		return -1;
+	return !permutable;
+}
+
+/* Is "node" the outermost node in its branch that can be tiled
+ * and then mapped to block and thread identifiers?
+ * If there are no such nodes in the subtree at "node" and
+ * if "node" is not a filter node, then it is accepted too.
+ */
+static int is_outer_tilable(__isl_keep isl_schedule_node *node)
+{
+	int tilable;
+	isl_schedule_node *ancestor;
+
+	tilable = is_candidate(node);
+	if (tilable < 0)
+		return -1;
+	if (!tilable)
+		return 0;
+
+	tilable = 0;
+	ancestor = isl_schedule_node_copy(node);
+	while (isl_schedule_node_has_parent(ancestor)) {
+		ancestor = isl_schedule_node_parent(ancestor);
+
+		tilable = is_candidate(ancestor);
+		if (tilable < 0 || tilable)
+			break;
+	}
+
+	isl_schedule_node_free(ancestor);
+	return tilable < 0 ? -1 : !tilable;
+}
+
+/* Collect the references to all writes in "group".
+ * Each reference is represented by a universe set in a space + * + * [S[i,j] -> R[]] + * + * with S[i,j] the statement instance space and R[] the array reference. + */ +static __isl_give isl_union_set *group_tagged_writes( + struct gpu_array_ref_group *group) +{ + int i; + isl_space *space; + isl_union_set *writes; + + space = isl_map_get_space(group->access); + writes = isl_union_set_empty(space); + for (i = 0; i < group->n_ref; ++i) { + isl_space *space; + isl_set *writes_i; + + if (!group->refs[i]->write) + continue; + + space = isl_map_get_space(group->refs[i]->tagged_access); + space = isl_space_domain(space); + writes_i = isl_set_universe(space); + writes = isl_union_set_add_set(writes, writes_i); + } + + return writes; +} + +/* Is there any write access in "group" that requires synchronization + * on a write to global memory? + * We currently take into account all writes that would require + * synchronization at the thread level depth, but if the copying + * for this group is performed at an outer level, then we do not + * actually need to take into account dependences at intermediate levels. + */ +static int any_sync_writes_in_group(struct ppcg_kernel *kernel, + struct gpu_array_ref_group *group) +{ + isl_union_set *writes; + int empty, disjoint; + + empty = isl_union_set_is_empty(kernel->sync_writes); + if (empty < 0) + return -1; + if (empty) + return 0; + + writes = group_tagged_writes(group); + disjoint = isl_union_set_is_disjoint(kernel->sync_writes, writes); + isl_union_set_free(writes); + + return disjoint < 0 ? -1 : !disjoint; +} + +/* Collect the references to all writes in "kernel" that write directly + * to global or shared memory, i.e., that are not mapped to private memory. + * Each reference is represented by a universe set in a space + * + * [S[i,j] -> R[]] + * + * with S[i,j] the statement instance space and R[] the array reference. + */ +static __isl_give isl_union_set *collect_non_private_tagged_writes( + struct ppcg_kernel *kernel) +{ + isl_union_set *writes; + int i, j; + + writes = isl_union_set_empty(isl_union_set_get_space(kernel->arrays)); + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *array = &kernel->array[i]; + + for (j = 0; j < array->n_group; ++j) { + struct gpu_array_ref_group *group = array->groups[j]; + enum ppcg_group_access_type type; + isl_union_set *writes_ij; + + if (!group->write) + continue; + type = gpu_array_ref_group_type(group); + if (type == ppcg_access_private) + continue; + writes_ij = group_tagged_writes(group); + writes = isl_union_set_union(writes, writes_ij); + } + } + + return writes; +} + +/* Are there any direct writes to global memory that require + * synchronization? + */ +static int any_global_or_shared_sync_writes(struct ppcg_kernel *kernel) +{ + isl_union_set *writes; + int empty, disjoint; + + empty = isl_union_set_is_empty(kernel->sync_writes); + if (empty < 0) + return -1; + if (empty) + return 0; + + writes = collect_non_private_tagged_writes(kernel); + disjoint = isl_union_set_is_disjoint(kernel->sync_writes, writes); + isl_union_set_free(writes); + + return disjoint < 0 ? -1 : !disjoint; +} + +/* Construct an isl_multi_val for use as tile sizes for tiling "node" + * from the elements in "tile_size". 
+ */
+static __isl_give isl_multi_val *construct_band_tiles_sizes(
+	__isl_keep isl_schedule_node *node, int *tile_size)
+{
+	isl_space *space;
+
+	if (!node)
+		return NULL;
+
+	space = isl_schedule_node_band_get_space(node);
+	return ppcg_multi_val_from_int_list(space, tile_size);
+}
+
+/* Replace the partial schedule S of the band node "node" by
+ *
+ *	floor(S/f)
+ *
+ * or
+ *
+ *	f * floor(S/f)
+ *
+ * if scale_tile_loops is set, with f the integers in "factor".
+ * The list that "factor" points to is assumed to contain at least
+ * as many elements as the number of members in the band.
+ */
+static __isl_give isl_schedule_node *snap_band_to_sizes(
+	__isl_take isl_schedule_node *node, int *factor,
+	struct ppcg_options *options)
+{
+	isl_multi_val *mv;
+
+	mv = construct_band_tiles_sizes(node, factor);
+	node = isl_schedule_node_band_scale_down(node, isl_multi_val_copy(mv));
+	if (options->scale_tile_loops)
+		node = isl_schedule_node_band_scale(node,
+						isl_multi_val_copy(mv));
+	isl_multi_val_free(mv);
+
+	return node;
+}
+
+/* Tile "band" with tile size specified by "sizes".
+ *
+ * Since the tile loops will be mapped to block ids, we forcibly
+ * turn off tile loop scaling. We may want to enable tile loop scaling
+ * at some later point, but then we would have to support the detection
+ * of strides during the mapping to block ids.
+ * Similarly, since the point loops will be mapped to thread ids,
+ * we forcibly shift the point loops so that they start at zero.
+ */
+static __isl_give isl_schedule_node *tile_band(
+	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes)
+{
+	isl_ctx *ctx = isl_schedule_node_get_ctx(node);
+	int scale_tile;
+	int shift_point;
+
+	scale_tile = isl_options_get_tile_scale_tile_loops(ctx);
+	isl_options_set_tile_scale_tile_loops(ctx, 0);
+	shift_point = isl_options_get_tile_shift_point_loops(ctx);
+	isl_options_set_tile_shift_point_loops(ctx, 1);
+
+	node = isl_schedule_node_band_tile(node, sizes);
+
+	isl_options_set_tile_scale_tile_loops(ctx, scale_tile);
+	isl_options_set_tile_shift_point_loops(ctx, shift_point);
+
+	return node;
+}
+
+/* Extract the set of parameter values and outer schedule dimensions
+ * for which any statement instance
+ * in the kernel inserted at "node" needs to be executed.
+ * Intersect the set of parameter values derived from the host schedule
+ * relation with the context of "prog".
+ */
+static __isl_give isl_set *extract_context(__isl_keep isl_schedule_node *node,
+	struct gpu_prog *prog)
+{
+	isl_union_map *schedule;
+	isl_union_set *schedule_domain;
+	isl_set *context;
+	int empty;
+
+	schedule = isl_schedule_node_get_prefix_schedule_relation(node);
+	schedule_domain = isl_union_map_range(schedule);
+	empty = isl_union_set_is_empty(schedule_domain);
+	if (empty < 0) {
+		isl_union_set_free(schedule_domain);
+		return NULL;
+	}
+	if (empty) {
+		int depth;
+		isl_space *space;
+
+		space = isl_union_set_get_space(schedule_domain);
+		isl_union_set_free(schedule_domain);
+		space = isl_space_set_from_params(space);
+		depth = isl_schedule_node_get_schedule_depth(node);
+		space = isl_space_add_dims(space, isl_dim_set, depth);
+		context = isl_set_empty(space);
+	} else {
+		context = isl_set_from_union_set(schedule_domain);
+	}
+	context = isl_set_intersect_params(context,
+					isl_set_copy(prog->context));
+
+	return context;
+}
+
+/* Return the set of outer array elements accessed by
+ * the statement instances in "domain" in "prog".
+ * The instances in "domain" are those that appear + * in the domains of the access relations in "prog". + */ +static __isl_give isl_union_set *accessed_by_domain( + __isl_take isl_union_set *domain, struct gpu_prog *prog) +{ + isl_union_map *access; + isl_union_set *arrays; + + access = isl_union_map_union(isl_union_map_copy(prog->read), + isl_union_map_copy(prog->may_write)); + access = isl_union_map_intersect_domain(access, domain); + arrays = isl_union_map_range(access); + arrays = isl_union_set_apply(arrays, + isl_union_map_copy(prog->to_outer)); + + return arrays; +} + +/* Return the number of outer band members of the band node "node" + * that are marked coincident. + */ +static int n_outer_coincidence(__isl_keep isl_schedule_node *node) +{ + int i, n; + + n = isl_schedule_node_band_n_member(node); + + for (i = 0; i < n; ++i) + if (!isl_schedule_node_band_member_get_coincident(node, i)) + break; + + return i; +} + +/* If the band node "node" has more than "n" members, then split off + * the first "n" of them. + */ +static __isl_give isl_schedule_node *split_band( + __isl_take isl_schedule_node *node, int n) +{ + int dim; + + dim = isl_schedule_node_band_n_member(node); + if (n < dim) + node = isl_schedule_node_band_split(node, n); + + return node; +} + +/* Scale a band node that may have been split by split_band. + * "sizes" are the scaling factors for the original node. + * "node" either points to the original band node, or the outer + * of the two pieces after splitting. + * + * If the number of elements in "node" is smaller than the number of + * elements in "sizes", then some splitting has occurred and we split + * "sizes" in the same way. + */ +static __isl_give isl_schedule_node *scale_band( + __isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes) +{ + int n, dim; + + n = isl_multi_val_dim(sizes, isl_dim_set); + dim = isl_schedule_node_band_n_member(node); + if (n > dim) { + isl_multi_val *sizes2; + + sizes2 = isl_multi_val_copy(sizes); + sizes = isl_multi_val_drop_dims(sizes, + isl_dim_set, dim, n - dim); + sizes2 = isl_multi_val_drop_dims(sizes2, isl_dim_set, 0, dim); + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_band_scale(node, sizes2); + node = isl_schedule_node_parent(node); + } + + return isl_schedule_node_band_scale(node, sizes); +} + +/* Return an isl_multi_aff, with as elements the parameters in "space" + * that have the names specified by the elements in "names". + * If (some of) these parameters do not already appear in "space", + * then they are added first. 
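+ *
+ * For example (with hypothetical names), for "names" { b0, b1 } and
+ * a space that only has a parameter n, the result is roughly
+ *
+ *	[n, b0, b1] -> [(b0), (b1)]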
+ */ +static __isl_give isl_multi_aff *parameter_vector(__isl_take isl_space *space, + __isl_keep isl_id_list *names) +{ + int i, n; + isl_local_space *ls; + isl_multi_aff *ma; + + if (!names) + space = isl_space_free(space); + + n = isl_id_list_n_id(names); + for (i = 0; i < n; ++i) { + int pos; + isl_id *id; + + id = isl_id_list_get_id(names, i); + pos = isl_space_find_dim_by_id(space, isl_dim_param, id); + if (pos >= 0) { + isl_id_free(id); + continue; + } + pos = isl_space_dim(space, isl_dim_param); + space = isl_space_add_dims(space, isl_dim_param, 1); + space = isl_space_set_dim_id(space, isl_dim_param, pos, id); + } + ma = isl_multi_aff_zero(isl_space_copy(space)); + ls = isl_local_space_from_space(isl_space_domain(space)); + for (i = 0; i < n; ++i) { + int pos; + isl_id *id; + isl_aff *aff; + + id = isl_id_list_get_id(names, i); + pos = isl_space_find_dim_by_id(space, isl_dim_param, id); + isl_id_free(id); + aff = isl_aff_var_on_domain(isl_local_space_copy(ls), + isl_dim_param, pos); + ma = isl_multi_aff_set_aff(ma, i, aff); + } + isl_local_space_free(ls); + + return ma; +} + +/* Return constraints on the domain elements that equate a sequence of + * parameters called "names", to the partial schedule + * of "node" modulo the integers in "size". + * The number of elements in the array "size" should be equal + * to the number of elements in "names". + * The number of members of the band node "node" should be smaller + * than or equal to this number. If it is smaller, then the first + * elements of "names" are equated to zero. + */ +static __isl_give isl_union_set *set_schedule_modulo( + __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names, + int *size) +{ + int n, n_zero; + isl_space *space; + isl_multi_aff *ma; + isl_multi_union_pw_aff *mupa, *mupa2; + isl_multi_val *mv; + isl_union_set *domain; + + if (!node) + return NULL; + n = isl_id_list_n_id(names); + if (n == 0) + return isl_schedule_node_get_universe_domain(node); + n_zero = n - isl_schedule_node_band_n_member(node); + + mupa = isl_schedule_node_band_get_partial_schedule(node); + mv = construct_band_tiles_sizes(node, size + n_zero); + mupa = isl_multi_union_pw_aff_mod_multi_val(mupa, mv); + + space = isl_multi_union_pw_aff_get_space(mupa); + space = isl_space_params(space); + space = isl_space_set_from_params(space); + space = isl_space_add_dims(space, isl_dim_set, n_zero); + ma = isl_multi_aff_zero(space); + + domain = isl_schedule_node_get_universe_domain(node); + mupa2 = isl_multi_union_pw_aff_multi_aff_on_domain( + isl_union_set_copy(domain), ma); + mupa = isl_multi_union_pw_aff_range_product(mupa2, mupa); + + space = isl_multi_union_pw_aff_get_space(mupa); + ma = parameter_vector(space, names); + + mupa2 = isl_multi_union_pw_aff_multi_aff_on_domain(domain, ma); + mupa = isl_multi_union_pw_aff_sub(mupa, mupa2); + + return isl_multi_union_pw_aff_zero_union_set(mupa); +} + +/* Insert a context node at "node" introducing the block and thread + * identifiers along with their bounds, which are stored in kernel->grid_size + * and kernel->block_dim. + * Note that the bounds on the block identifiers may implicitly impose + * constraints on the parameters. A guard needs to be inserted + * in the schedule tree to ensure that those bounds hold at "node". + * This guard is inserted in insert_guard. 
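+ *
+ * For example (with hypothetical names and a 16x16 block), the inserted
+ * context is roughly of the form
+ *
+ *	{ : 0 <= b0 < grid_size_0 and 0 <= b1 < grid_size_1 and
+ *	    0 <= t0 <= 15 and 0 <= t1 <= 15 }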
+ */ +static __isl_give isl_schedule_node *insert_context(struct ppcg_kernel *kernel, + __isl_take isl_schedule_node *node) +{ + isl_set *context; + + context = isl_set_universe(isl_set_get_space(kernel->context)); + + context = add_bounded_parameters_dynamic(context, + kernel->grid_size, kernel->block_ids); + context = add_bounded_parameters(context, + kernel->block_dim, kernel->thread_ids); + + node = isl_schedule_node_insert_context(node, context); + + return node; +} + +/* Insert a guard that eliminates kernel launches where the kernel + * obviously does not have any work to do. + * + * In particular, eliminate kernel launches where there are obviously + * zero blocks. + * Use the same block size constraints that are used to create the context + * to ensure that all constraints implicit in the constructed context + * are imposed by the guard. + * + * Additionally, add other constraints that are valid + * for each executed instance ("context"), as long as this does not result + * in a disjunction. + */ +static __isl_give isl_schedule_node *insert_guard( + __isl_take isl_schedule_node *node, __isl_keep isl_set *context, + __isl_keep isl_multi_pw_aff *size, struct ppcg_scop *scop) +{ + unsigned nparam, n; + isl_set *guard; + isl_id_list *ids; + + guard = isl_set_copy(context); + guard = isl_set_compute_divs(guard); + guard = isl_set_from_basic_set(isl_set_simple_hull(guard)); + + nparam = isl_set_dim(guard, isl_dim_param); + n = isl_multi_pw_aff_dim(size, isl_dim_out); + ids = ppcg_scop_generate_names(scop, n, "__ppcg_tmp"); + guard = add_bounded_parameters_dynamic(guard, size, ids); + isl_id_list_free(ids); + guard = isl_set_project_out(guard, isl_dim_param, nparam, n); + + node = isl_schedule_node_insert_guard(node, guard); + + return node; +} + +/* Does any array reference group mapping require the band that is mapped + * to threads to be unrolled? + */ +static int kernel_requires_unroll(struct ppcg_kernel *kernel) +{ + int i, j; + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *array = &kernel->array[i]; + + for (j = 0; j < array->n_group; ++j) { + struct gpu_array_ref_group *group = array->groups[j]; + if (gpu_array_ref_group_requires_unroll(group)) + return 1; + } + } + + return 0; +} + +/* Mark the given band node "node" for unrolling by the AST generator and + * then sink it to the leaves of the schedule tree. + * All dimensions of "node" are assumed to be coincident, such that this + * sinking is a valid operation. + */ +static __isl_give isl_schedule_node *unroll(__isl_take isl_schedule_node *node) +{ + node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); + + node = isl_schedule_node_band_sink(node); + + return node; +} + +/* Insert a synchronization node in the schedule tree of "node" + * after the core computation of "kernel" at the level of the band + * that is mapped to threads, except if that level is equal to + * that of the band that is mapped to blocks or if there are no writes + * to global or shared memory in the core computation that require + * synchronization. + * If there are any writes to shared memory and the shared memory + * copying is performed at the same level, then synchronization + * is needed between the core and the copying anyway, so we might + * as well add it here. If the copying is performed at a higher + * level, then different iterations of intermediate schedule dimensions + * may have a different mapping from between shared memory elements and + * threads, such that synchronization is required after the core. 
+ * "node" is assumed to point to the kernel node. + * + * If the shared and the thread mark point to the same node, then make + * sure the synchronization is inserted outside of the shared mark. + */ +static __isl_give isl_schedule_node *add_sync(struct ppcg_kernel *kernel, + __isl_take isl_schedule_node *node) +{ + int depth; + int need_sync; + + need_sync = any_global_or_shared_sync_writes(kernel); + if (need_sync < 0) + return isl_schedule_node_free(node); + if (!need_sync) + return node; + + node = gpu_tree_move_down_to_thread(node, kernel->core); + depth = isl_schedule_node_get_schedule_depth(node); + node = gpu_tree_move_up_to_kernel(node); + if (depth == isl_schedule_node_get_schedule_depth(node)) + return node; + + node = gpu_tree_move_down_to_depth(node, depth, kernel->core); + node = gpu_tree_ensure_following_sync(node, kernel); + + node = gpu_tree_move_up_to_kernel(node); + + return node; +} + +/* Return a read ("read" is 1) or write access relation for "group" + * with those accesses removed that are only needed to communicate data + * within the subtree of the schedule rooted at "node". + * Furthermore, include the prefix schedule at "node". + * That is, return a relation of the form + * + * S -> [D -> A] + * + * with D the outer schedule dimensions at "node". + */ +static __isl_give isl_union_map *anchored_non_local_accesses( + struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, + __isl_take isl_schedule_node *node, int read) +{ + isl_union_map *access; + isl_union_map *prefix; + + prefix = isl_schedule_node_get_prefix_schedule_relation(node); + prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, + isl_union_pw_multi_aff_copy(kernel->contraction)); + access = gpu_array_ref_group_access_relation(group, read, !read); + access = remove_local_accesses_group(kernel, group, access, prefix, + read); + access = isl_union_map_range_product(prefix, access); + + return access; +} + +/* Given an array reference group "group", create a mapping + * + * read[D -> A] -> [D -> A] + * + * if "read" is set or + * + * write[D -> A] -> [D -> A] + * + * if "read" is not set. + * D corresponds to the outer tile->depth dimensions of + * the kernel schedule. + */ +static __isl_give isl_multi_aff *create_from_access(isl_ctx *ctx, + struct gpu_array_ref_group *group, int read) +{ + struct gpu_array_tile *tile; + isl_space *space; + isl_id *id; + + tile = gpu_array_ref_group_tile(group); + space = isl_space_copy(group->array->space); + space = isl_space_from_range(space); + space = isl_space_add_dims(space, isl_dim_in, tile->depth); + space = isl_space_wrap(space); + space = isl_space_map_from_set(space); + + id = isl_id_alloc(ctx, read ? "read" : "write", group); + space = isl_space_set_tuple_id(space, isl_dim_in, id); + + return isl_multi_aff_identity(space); +} + +/* If any writes in "group" require synchronization, then make sure + * that there is a synchronization node for "kernel" after the node + * following "node" in a sequence. + * + * If "shared" is set and no synchronization is needed for + * the writes to global memory, then add synchronization before + * the kernel to protect shared memory from being overwritten + * by the next iteration of the core computation. + * No additional synchronization is needed to protect against + * the next copy into shared memory because each element of + * the shared memory tile is always copied by the same thread. 
+ */ +static __isl_give isl_schedule_node *add_group_write_sync( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel, + struct gpu_array_ref_group *group, int shared) +{ + int need_sync; + + need_sync = any_sync_writes_in_group(kernel, group); + if (need_sync < 0) + return isl_schedule_node_free(node); + if (need_sync) { + node = isl_schedule_node_parent(node); + node = isl_schedule_node_next_sibling(node); + node = isl_schedule_node_child(node, 0); + node = gpu_tree_ensure_following_sync(node, kernel); + } else if (shared) { + struct gpu_array_tile *tile; + + tile = gpu_array_ref_group_tile(group); + node = isl_schedule_node_parent(node); + node = isl_schedule_node_parent(node); + node = gpu_tree_move_down_to_depth(node, tile->depth, + kernel->core); + node = gpu_tree_move_left_to_sync(node, kernel); + } + + return node; +} + +/* Add copy statements to the schedule tree of "node" + * for reading from global memory to private memory (if "read" is set) or + * for writing back from private memory to global memory + * (if "read" is not set) for the array reference group "group" that + * is mapped to private memory. + * On input, "node" points to the kernel node, and it is moved + * back there on output. + * + * The copies are performed in the order of the array elements. + * The copy statement instances include a reference to the outer + * tile->depth dimensions of the kernel schedule for ease of + * combining them with the group tiling. + * + * That is, the extra schedule is of the form + * + * type[D -> A] -> A + * + * where D corresponds to the outer tile->depth dimensions of + * the kernel schedule and A to the global array. + * This schedule is unrolled because registers are not addressable. + * + * The copying is inserted in the schedule tree through an extension + * of the form + * + * D -> type[D -> A] + * + * where the extra domain elements type[D -> A] are those accessed + * by the group. + * A filter is inserted on type[D -> A] to ensure that the element + * is read/written by the same thread that needs the element. + * This filter is obtained by applying + * + * S -> type[D -> A] + * + * to the thread filter for the core statements. + * + * The extension is inserted before the core computation in case of a read + * and after the core computation in case of a write. + * In the latter case, we also make sure that there is a synchronization + * node after the write to global memory, unless this write is performed + * at the outer level of the kernel. + * In principle, this synchronization could be inserted higher + * in the schedule tree depending on where the corresponding reads + * from global memory are performed. 
+ */ +static __isl_give isl_schedule_node *add_copies_group_private( + struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, + __isl_take isl_schedule_node *node, int read) +{ + struct gpu_array_tile *tile; + isl_union_map *access; + isl_union_set *domain; + isl_space *space; + isl_multi_aff *from_access; + isl_multi_pw_aff *mpa; + isl_multi_union_pw_aff *mupa; + isl_union_pw_multi_aff *contraction; + isl_schedule_node *graft; + isl_union_set *filter; + int kernel_depth; + int empty; + + kernel_depth = isl_schedule_node_get_schedule_depth(node); + tile = gpu_array_ref_group_tile(group); + node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core); + + access = anchored_non_local_accesses(kernel, group, node, read); + empty = isl_union_map_is_empty(access); + if (empty < 0 || empty) { + isl_union_map_free(access); + if (empty < 0) + return isl_schedule_node_free(node); + return gpu_tree_move_up_to_kernel(node); + } + + group->array->global = 1; + group->local_array->global = 1; + + from_access = create_from_access(kernel->ctx, group, read); + space = isl_space_domain(isl_multi_aff_get_space(from_access)); + access = isl_union_map_preimage_range_multi_aff(access, from_access); + + filter = isl_union_set_copy(kernel->thread_filter); + contraction = isl_union_pw_multi_aff_copy(kernel->contraction); + filter = isl_union_set_preimage_union_pw_multi_aff(filter, contraction); + filter = isl_union_set_apply(filter, isl_union_map_copy(access)); + filter = isl_union_set_detect_equalities(filter); + filter = isl_union_set_coalesce(filter); + + domain = isl_union_map_range(access); + access = isl_union_set_wrapped_domain_map(domain); + access = isl_union_map_reverse(access); + access = isl_union_map_coalesce(access); + graft = isl_schedule_node_from_extension(access); + + space = isl_space_map_from_set(space); + mpa = isl_multi_pw_aff_identity(space); + mpa = isl_multi_pw_aff_range_factor_range(mpa); + mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa); + + graft = isl_schedule_node_child(graft, 0); + graft = isl_schedule_node_insert_partial_schedule(graft, mupa); + graft = unroll(graft); + + graft = isl_schedule_node_insert_filter(graft, filter); + + graft = isl_schedule_node_parent(graft); + + if (read) + node = isl_schedule_node_graft_before(node, graft); + else { + node = isl_schedule_node_graft_after(node, graft); + if (kernel_depth < tile->depth) + node = add_group_write_sync(node, kernel, group, 0); + } + + node = gpu_tree_move_up_to_kernel(node); + + return node; +} + +/* Add copy statements to the schedule tree of "node" + * for reading from global memory to shared memory (if "read" is set) or + * for writing back from shared memory to global memory + * (if "read" is not set) for the array reference group "group" that + * is mapped to shared memory. + * On input, "node" points to the kernel node, and it is moved + * back there on output. + * + * The copies are performed in the order of the corresponding shared + * memory tile. + * The copy statement instances include a reference to the outer + * tile->depth dimensions of the kernel schedule for ease of + * combining them with the group tiling. + * + * If we are performing a read from global memory to shared memory and + * if the array involved is not a scalar, then we copy + * the entire tile to shared memory. This may result in some extra + * elements getting copied, but it should lead to simpler code + * (which means that fewer registers may be needed) and less divergence. 
+ *
+ * Otherwise, we only copy the elements that will be read or have been written
+ * in the kernel.
+ *
+ * That is, the extra schedule is of the form
+ *
+ *	type[D -> A] -> T
+ *
+ * where D corresponds to the outer tile->depth dimensions of
+ * the kernel schedule, A to the global array and T is the corresponding
+ * shared memory tile.
+ *
+ * The copying is inserted in the schedule tree through an extension
+ * of the form
+ *
+ *	D -> type[D -> A]
+ *
+ * where the extra domain elements type[D -> A] are those accessed
+ * by the group. In the case of read from a non-scalar, this set
+ * is replaced by the entire shared memory tile.
+ *
+ * If the "unroll_copy_shared" option is set, then the AST generator
+ * is instructed to unroll the copying code.
+ *
+ * A filter is inserted on type[D -> A] to map the copy instances
+ * to the threads. In particular, the thread identifiers are
+ * equated to the position inside the shared memory tile (T)
+ * modulo the block size.
+ * We try to align the innermost tile dimension with the innermost
+ * thread identifier (x) as a heuristic to improve coalescing.
+ * In particular, if the dimension of the tile is greater than
+ * the dimension of the block, then the schedule mapping to the tile
+ * is broken up into two pieces and the filter is applied to the inner part.
+ * If, on the other hand, the dimension of the tile is smaller than
+ * the dimension of the block, then the initial thread identifiers
+ * are equated to zero and the remaining thread identifiers are
+ * matched to the memory tile.
+ *
+ * The extension is inserted before the core computation in case of a read
+ * and after the core computation in case of a write.
+ * In the case of a read, we first need to make sure there is some
+ * synchronization before the core computation such that we can put the read
+ * from global memory to shared memory before that synchronization.
+ * This ensures that all threads have finished copying into shared memory
+ * before the shared memory is used.
+ * We also need to make sure that there is a synchronization node after
+ * the core computation to ensure that the next load into shared memory
+ * only happens after all data has been used. There is no need for
+ * this synchronization if we are at the outer level since then there
+ * won't be a next load.
+ * In the case of a write, we need to make sure there is some synchronization
+ * after the core computation such that we can put the write from shared
+ * memory to global memory after that synchronization.
+ * Unless we are at the outer level, we also need a synchronization node
+ * after the write to ensure the data is saved to global memory
+ * before the next iteration write to the same shared memory.
+ * It also makes sure the data has arrived in global memory before
+ * it is read in a subsequent iteration.
+ */ +static __isl_give isl_schedule_node *add_copies_group_shared( + struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, + __isl_take isl_schedule_node *node, int read) +{ + struct gpu_array_tile *tile; + isl_union_map *access; + isl_union_set *domain; + isl_multi_aff *ma; + isl_multi_aff *from_access; + isl_multi_pw_aff *mpa; + isl_multi_union_pw_aff *mupa; + isl_schedule_node *graft; + isl_union_set *filter; + int skip; + int kernel_depth; + int empty; + + tile = gpu_array_ref_group_tile(group); + kernel_depth = isl_schedule_node_get_schedule_depth(node); + node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core); + + access = anchored_non_local_accesses(kernel, group, node, read); + empty = isl_union_map_is_empty(access); + if (empty < 0 || empty) { + isl_union_map_free(access); + if (empty < 0) + return isl_schedule_node_free(node); + return gpu_tree_move_up_to_kernel(node); + } + + group->array->global = 1; + group->local_array->global = 1; + + from_access = create_from_access(kernel->ctx, group, read); + + ma = isl_multi_aff_copy(tile->tiling); + ma = isl_multi_aff_pullback_multi_aff(ma, + isl_multi_aff_copy(from_access)); + mpa = isl_multi_pw_aff_from_multi_aff(ma); + mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa); + + domain = isl_union_map_range(access); + + if (read && !gpu_array_is_scalar(group->array)) { + isl_map *map; + isl_union_set_free(domain); + map = group_tile(group); + domain = isl_union_set_from_set(isl_map_wrap(map)); + } + + domain = isl_union_set_preimage_multi_aff(domain, from_access); + access = isl_union_set_wrapped_domain_map(domain); + access = isl_union_map_reverse(access); + access = isl_union_map_coalesce(access); + graft = isl_schedule_node_from_extension(access); + + graft = isl_schedule_node_child(graft, 0); + + graft = isl_schedule_node_insert_partial_schedule(graft, mupa); + if (kernel->options->unroll_copy_shared) + graft = ppcg_set_schedule_node_type(graft, isl_ast_loop_unroll); + + if (tile->n > kernel->n_block && kernel->n_block > 0) { + graft = isl_schedule_node_band_split(graft, + tile->n - kernel->n_block); + graft = isl_schedule_node_child(graft, 0); + } + if (tile->n < kernel->n_block) + skip = kernel->n_block - tile->n; + else + skip = 0; + filter = set_schedule_modulo(graft, kernel->thread_ids, + kernel->block_dim); + if (!kernel->options->wrap) + graft = snap_band_to_sizes(graft, kernel->block_dim + skip, + kernel->options); + if (tile->n > kernel->n_block && kernel->n_block > 0) + graft = isl_schedule_node_parent(graft); + graft = isl_schedule_node_insert_filter(graft, filter); + + while (graft && isl_schedule_node_has_parent(graft)) + graft = isl_schedule_node_parent(graft); + + if (read) { + if (kernel_depth < tile->depth) + node = gpu_tree_ensure_sync_after_core(node, kernel); + node = gpu_tree_move_left_to_sync(node, kernel); + node = isl_schedule_node_graft_before(node, graft); + } else { + node = gpu_tree_move_right_to_sync(node, kernel); + node = isl_schedule_node_graft_after(node, graft); + if (kernel_depth < tile->depth) + node = add_group_write_sync(node, kernel, group, 1); + } + + node = gpu_tree_move_up_to_kernel(node); + + return node; +} + +/* Check whether the array reference group "group" is mapped to + * private or shared memory and, if so, + * add copy statements to the schedule tree of "node" + * for reading from global memory to private or shared memory + * (if "read" is set) or for writing back from private or shared memory + * to global memory (if "read" is not set) for this group. 
+ * On input, "node" points to the kernel node, and it is moved + * back there on output. + */ +static __isl_give isl_schedule_node *add_copies_group( + struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, + __isl_take isl_schedule_node *node, int read) +{ + enum ppcg_group_access_type type; + + type = gpu_array_ref_group_type(group); + if (type == ppcg_access_private) + return add_copies_group_private(kernel, group, node, read); + if (type == ppcg_access_shared) + return add_copies_group_shared(kernel, group, node, read); + return node; +} + +/* For each array reference group that is mapped to private or shared memory, + * add copy statements to the schedule tree of "node" + * for reading from global memory to private or shared memory + * and for writing back. + * On input, "node" points to the kernel node, and it is moved + * back there on output. + */ +static __isl_give isl_schedule_node *add_copies(struct ppcg_kernel *kernel, + __isl_take isl_schedule_node *node) +{ + int i, j; + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *array = &kernel->array[i]; + + for (j = 0; j < array->n_group; ++j) { + struct gpu_array_ref_group *group = array->groups[j]; + + node = add_copies_group(kernel, group, node, 1); + if (!node) + return NULL; + node = add_copies_group(kernel, group, node, 0); + if (!node) + return NULL; + } + } + + return node; +} + +/* Mark all dimensions in the current band node atomic. + */ +static __isl_give isl_schedule_node *atomic(__isl_take isl_schedule_node *node) +{ + return ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); +} + +/* Mark "node" atomic, if it is a band node. + * Do the same for all ancestors. + * Return a pointer to "node" (in the updated schedule tree). + */ +static __isl_give isl_schedule_node *atomic_ancestors( + __isl_take isl_schedule_node *node) +{ + int pos; + + if (!node) + return NULL; + if (!isl_schedule_node_has_parent(node)) + return node; + + pos = isl_schedule_node_get_child_position(node); + node = isl_schedule_node_parent(node); + if (isl_schedule_node_get_type(node) == isl_schedule_node_band) + node = atomic(node); + node = atomic_ancestors(node); + node = isl_schedule_node_child(node, pos); + + return node; +} + +/* Collect all write references that require synchronization. + * "node" is assumed to point to the kernel node. + * Each reference is represented by a universe set in a space + * + * [S[i,j] -> R[]] + * + * with S[i,j] the statement instance space and R[] the array reference. + * + * This function should be called before block and thread filters are added. + * + * Synchronization is needed after a write if there is a subsequent read + * within the same block that may not be performed by the same thread. + * There should not be any dependences between different blocks, + * so we start with the flow dependences within the same kernel invocation + * and we subtract from these those dependences that are mapped + * to the same iteration of the bands where synchronization is inserted. + * We do not remove pairs of instances that are known to map to + * the same thread across different iterations of the intermediate + * bands because the read may be performed by a different thread + * than the one that needs the value if shared memory is involved. + * + * We also consider all pairs of possible writes that access the same + * memory location and that may be mapped to the same block but not + * to the same iteration of the intermediate bands. 
+ * In theory, it would be possible for one thread to still be in
+ * a previous iteration of a loop in these bands.
+ * A write to global memory in this delayed thread could then overwrite
+ * a write from another thread that has already moved on to
+ * the next iteration.
+ *
+ * After computing the above writes paired off with reads or writes
+ * that depend on them, we project onto the domain writes.
+ * Synchronization is needed after writes to global memory
+ * through these references.
+ */
+static __isl_give isl_union_set *compute_sync_writes(
+	struct ppcg_kernel *kernel, __isl_keep isl_schedule_node *node)
+{
+	isl_union_map *local;
+	isl_union_map *may_writes, *shared_access;
+	isl_union_map *kernel_prefix, *thread_prefix;
+	isl_union_map *equal;
+	isl_union_set *wrap;
+	isl_union_set *domain;
+	isl_union_pw_multi_aff *contraction;
+
+	kernel_prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
+	node = isl_schedule_node_copy(node);
+	node = gpu_tree_move_down_to_thread(node, kernel->core);
+	thread_prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
+	isl_schedule_node_free(node);
+
+	contraction = kernel->contraction;
+	kernel_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
+		kernel_prefix, isl_union_pw_multi_aff_copy(contraction));
+	thread_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
+		thread_prefix, isl_union_pw_multi_aff_copy(contraction));
+	domain = isl_union_set_copy(kernel->expanded_domain);
+	domain = isl_union_set_universe(domain);
+
+	may_writes = isl_union_map_copy(kernel->prog->scop->tagged_may_writes);
+	may_writes = isl_union_map_curry(may_writes);
+	may_writes = isl_union_map_intersect_domain(may_writes, domain);
+	may_writes = isl_union_map_uncurry(may_writes);
+	shared_access = isl_union_map_copy(may_writes);
+	shared_access = isl_union_map_apply_range(shared_access,
+		isl_union_map_reverse(may_writes));
+
+	local = isl_union_map_copy(kernel->prog->scop->tagged_dep_flow);
+	local = isl_union_map_union(local, shared_access);
+	local = isl_union_map_zip(local);
+
+	equal = isl_union_map_apply_range(kernel_prefix,
+		isl_union_map_reverse(isl_union_map_copy(kernel_prefix)));
+	wrap = isl_union_map_wrap(equal);
+	local = isl_union_map_intersect_domain(local, wrap);
+	equal = isl_union_map_apply_range(thread_prefix,
+		isl_union_map_reverse(isl_union_map_copy(thread_prefix)));
+	wrap = isl_union_map_wrap(equal);
+	local = isl_union_map_subtract_domain(local, wrap);
+
+	local = isl_union_map_zip(local);
+	local = isl_union_map_universe(local);
+
+	return isl_union_map_domain(local);
+}
+
+/* Group the domain elements into a single space, named kernelX,
+ * with X the kernel sequence number "kernel_id".
+ */
+static __isl_give isl_schedule_node *group_statements(
+	__isl_take isl_schedule_node *node, int kernel_id)
+{
+	char buffer[20];
+	isl_id *id;
+
+	if (!node)
+		return NULL;
+
+	snprintf(buffer, sizeof(buffer), "kernel%d", kernel_id);
+	id = isl_id_alloc(isl_schedule_node_get_ctx(node), buffer, NULL);
+	return isl_schedule_node_group(node, id);
+}
+
+/* Create a ppcg_kernel representing the domain instances that reach "node"
+ * and insert a mark node pointing to the ppcg_kernel before "node".
+ * The band that "node" points to is the band that needs to be mapped
+ * to block identifiers. The band that needs to be mapped to thread
+ * identifiers should be marked by a "thread" mark by the caller.
+ * The linear branch between the current node and the "thread" mark
+ * may also have a "shared" mark.
If present, the mapping to shared + * memory is computed at that point. + * Both marks are removed by this function. + * If "scale" is set, then the band that "node" points to is scaled + * by "sizes". + * + * Mark all outer band nodes as atomic to ensure each kernel is only + * scheduled once. + * If the domain elements that reach "node" live in more than one space, + * then group the domain elements into a single space, named kernelX, + * with X the kernel sequence number. + * + * Insert a guard node governing the kernel node to ensure that + * no kernels with zero blocks are launched. + * + * Insert a context node describing the block and thread + * identifiers inside the kernel mark. + * The context node needs to be inserted after the effective block size + * has been determined such that the bounds on the thread identifiers + * would reflect the effective block size. + * Insert a filter node inside the context node mapping the statement + * instances to block identifiers. In particular, the block identifiers + * are equated to the partial schedule of band that was marked for mapping + * to blocks modulo the grid size. + * Insert a filter node inside the "thread" mark mapping the statement + * instances to thread identifiers. In particular, the thread identifiers + * are equated to the partial schedule of band that was marked for mapping + * to threads modulo the block size. + * + * Compute array reference groups for all arrays, set the local + * array bounds based on the set of domain instances that reach + * the kernel node, check the total amount of shared memory used + * and compute all group tilings. + * The array reference groups are computed after the block filter + * has been inserted because it affects the mapping to shared or + * private memory. This computation also requires the thread filter + * (in the ppcg_kernel object), but this thread filter should not + * have been added to the schedule tree yet since the computation + * requires the schedule of the band that needs to be mapped to + * threads before the privatization is applied. + * + * If any array reference group requires the band mapped to threads + * to be unrolled, then we perform the required unrolling. + * + * We save a copy of the schedule that may influence the mappings + * to shared or private memory in kernel->copy_schedule. + * + * Finally, we add synchronization and copy statements to the schedule tree, + * remove the "thread" mark and create representations for the local + * variables in the kernel. + * + * We keep a copy of the isl_id that points to the kernel to ensure + * that the kernel does not get destroyed if the schedule node + * is freed due to some error condition. 
+ */ +__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen, + __isl_take isl_schedule_node *node, int scale, + __isl_keep isl_multi_val *sizes) +{ + struct ppcg_kernel *kernel; + isl_id *id; + isl_schedule_node *node_thread; + isl_union_map *host_schedule; + isl_union_pw_multi_aff *contraction; + isl_set *host_domain; + isl_union_set *domain, *expanded; + int single_statement; + + node = gpu_tree_insert_shared_before_thread(node); + if (!node) + return NULL; + + kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel); + kernel = ppcg_kernel_create_local_arrays(kernel, gen->prog); + if (!kernel) + return isl_schedule_node_free(node); + + domain = isl_schedule_node_get_domain(node); + single_statement = isl_union_set_n_set(domain) == 1; + + kernel->ctx = gen->ctx; + kernel->prog = gen->prog; + kernel->options = gen->options; + kernel->context = extract_context(node, gen->prog); + kernel->core = isl_union_set_universe(isl_union_set_copy(domain)); + contraction = isl_schedule_node_get_subtree_contraction(node); + kernel->contraction = isl_union_pw_multi_aff_copy(contraction); + expanded = isl_union_set_copy(domain); + expanded = isl_union_set_preimage_union_pw_multi_aff(expanded, + contraction); + kernel->expanded_domain = isl_union_set_copy(expanded); + kernel->arrays = accessed_by_domain(expanded, gen->prog); + kernel->n_grid = n_outer_coincidence(node); + node_thread = isl_schedule_node_copy(node); + node_thread = gpu_tree_move_down_to_thread(node_thread, kernel->core); + node_thread = isl_schedule_node_child(node_thread, 0); + kernel->n_block = n_outer_coincidence(node_thread); + isl_schedule_node_free(node_thread); + kernel->id = gen->kernel_id++; + read_grid_and_block_sizes(kernel, gen); + + kernel->sync_writes = compute_sync_writes(kernel, node); + + host_schedule = isl_schedule_node_get_prefix_schedule_union_map(node); + host_domain = isl_set_from_union_set(isl_union_map_range( + host_schedule)); + + node = atomic_ancestors(node); + + id = isl_id_alloc(gen->ctx, "kernel", kernel); + id = isl_id_set_free_user(id, &ppcg_kernel_free_wrap); + node = isl_schedule_node_insert_mark(node, isl_id_copy(id)); + + if (!single_statement) + node = group_statements(node, kernel->id); + + node = isl_schedule_node_child(node, 0); + node = split_band(node, kernel->n_grid); + kernel->block_ids = ppcg_scop_generate_names(gen->prog->scop, + kernel->n_grid, "b"); + kernel->block_filter = set_schedule_modulo(node, kernel->block_ids, + kernel->grid_dim); + kernel->grid_size = extract_grid_size(kernel, + isl_union_set_copy(domain)); + if (!kernel->options->wrap) + node = snap_band_to_sizes(node, kernel->grid_dim, + kernel->options); + if (scale) + node = scale_band(node, isl_multi_val_copy(sizes)); + node = isl_schedule_node_parent(node); + if (!single_statement) + node = isl_schedule_node_parent(node); + node = insert_guard(node, kernel->context, kernel->grid_size, + gen->prog->scop); + node = gpu_tree_move_down_to_thread(node, kernel->core); + node = isl_schedule_node_child(node, 0); + node = split_band(node, kernel->n_block); + kernel->thread_ids = ppcg_scop_generate_names(gen->prog->scop, + kernel->n_block, "t"); + kernel->thread_filter = set_schedule_modulo(node, kernel->thread_ids, + kernel->block_dim); + if (extract_block_size(kernel, domain) < 0) + node = isl_schedule_node_free(node); + + node = gpu_tree_move_up_to_kernel(node); + node = isl_schedule_node_child(node, 0); + node = insert_context(kernel, node); + node = isl_schedule_node_child(node, 0); + node = 
isl_schedule_node_insert_filter(node, + isl_union_set_copy(kernel->block_filter)); + + node = gpu_tree_move_up_to_kernel(node); + + if (gpu_group_references(kernel, node) < 0) + node = isl_schedule_node_free(node); + localize_bounds(kernel, host_domain); + isl_set_free(host_domain); + + check_shared_memory_bound(kernel); + mark_global_arrays(kernel); + compute_group_tilings(kernel); + + node = gpu_tree_move_down_to_thread(node, kernel->core); + node = isl_schedule_node_child(node, 0); + if (!kernel->options->wrap) + node = snap_band_to_sizes(node, kernel->block_dim, + kernel->options); + node = isl_schedule_node_insert_filter(node, + isl_union_set_copy(kernel->thread_filter)); + if (kernel_requires_unroll(kernel)) { + node = isl_schedule_node_child(node, 0); + node = unroll(node); + } + + node = gpu_tree_move_up_to_thread(node); + kernel->copy_schedule_dim = isl_schedule_node_get_schedule_depth(node); + kernel->copy_schedule = + isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node); + contraction = isl_union_pw_multi_aff_copy(kernel->contraction); + kernel->copy_schedule = + isl_union_pw_multi_aff_pullback_union_pw_multi_aff( + kernel->copy_schedule, contraction); + + node = gpu_tree_move_up_to_kernel(node); + + node = add_sync(kernel, node); + node = add_copies(kernel, node); + + node = gpu_tree_move_down_to_shared(node, kernel->core); + node = isl_schedule_node_delete(node); + + node = gpu_tree_move_down_to_thread(node, kernel->core); + node = isl_schedule_node_delete(node); + + node = gpu_tree_move_up_to_kernel(node); + + if (create_kernel_vars(kernel) < 0) + node = isl_schedule_node_free(node); + + if (!single_statement) + node = isl_schedule_node_parent(node); + node = isl_schedule_node_parent(node); + + isl_id_free(id); + return node; +} + +/* Insert a zero-dimensional permutable band at "node". + */ +static __isl_give isl_schedule_node *insert_empty_permutable_band( + __isl_take isl_schedule_node *node) +{ + isl_space *space; + isl_schedule *schedule; + isl_union_set *domain; + isl_multi_union_pw_aff *mupa; + + schedule = isl_schedule_node_get_schedule(node); + domain = isl_schedule_get_domain(schedule); + space = isl_union_set_get_space(domain); + isl_union_set_free(domain); + isl_schedule_free(schedule); + + space = isl_space_set_from_params(space); + mupa = isl_multi_union_pw_aff_zero(space); + node = isl_schedule_node_insert_partial_schedule(node, mupa); + node = isl_schedule_node_band_set_permutable(node, 1); + + return node; +} + +/* See if hybrid tiling can be performed on "node" and its parent. + * If so, apply hybrid tiling and return the updated schedule tree. + * If not, return the original schedule tree. + * Return NULL on error. + * + * First check if "node", together with its parent, meets + * the basic requirements for hybrid tiling. + * If so, compute the relative dependence distances of "node" + * with respect to its parent and check if they are sufficiently bounded. + * If so, apply hybrid tiling using user specified tile sizes. + * + * The tile sizes are read before the dependence distance bounds are + * computed, because the user may have specified fewer dimensions + * than are available. In this case, the remaining schedule dimensions + * are split off and the dependence distances should be computed + * after these dimensions have been split off. 
+ */ +static __isl_give isl_schedule_node *try_hybrid_tile(struct gpu_gen *gen, + __isl_take isl_schedule_node *node) +{ + int tile_len; + int *tile_size; + isl_bool ok; + isl_schedule_node *orig = node; + ppcg_ht_bounds *bounds; + + ok = ppcg_ht_parent_has_input_pattern(node); + if (ok < 0) + return isl_schedule_node_free(node); + if (!ok) + return orig; + + tile_len = 1 + isl_schedule_node_band_n_member(node); + tile_size = read_tile_sizes(gen, &tile_len); + if (!tile_size) + return isl_schedule_node_free(node); + + node = isl_schedule_node_copy(node); + node = split_band(node, tile_len - 1); + node = isl_schedule_node_parent(node); + bounds = ppcg_ht_compute_bounds(gen->prog->scop, node); + node = isl_schedule_node_child(node, 0); + + ok = ppcg_ht_bounds_is_valid(bounds); + if (ok >= 0 && ok) + node = gpu_hybrid_tile(gen, node, bounds, tile_size); + else + ppcg_ht_bounds_free(bounds); + free(tile_size); + + if (ok >= 0 && !ok) { + isl_schedule_node_free(node); + return orig; + } + isl_schedule_node_free(orig); + if (ok < 0) + return isl_schedule_node_free(node); + return node; +} + +/* If "node" is the outermost permutable band that can be mapped to block and + * thread identifiers in its branch (or the root of a subtree with + * no such outer bands), + * then mark the band as such, attaching a ppcg_kernel to the mark. + * + * If hybrid tiling is allowed, then first try and apply it + * to "node" and its parent. + * + * If "node" is the root of a subtree without permutable bands, + * then insert a zero-dimensional permutable band such that + * we can assume that "node" always points to a band node. + * This includes the case where "node" already points to a band node, + * but one without any coincident dimension. In this case, + * the extra node ensures that this original node does not get tiled. + * + * Tile "node" using user specified tile sizes, after splitting the band + * if the number of specified tile sizes is smaller than the dimension + * of the band. Mark the point band of this tiling as the band that + * needs to be mapped to threads and instruct the AST generator to unroll + * the band if the "unroll_gpu_tile" option is set. + * Create a kernel representing the domain instances that reach "node" and + * insert a mark node pointing to the ppcg_kernel before the band node. 
+ */ +static __isl_give isl_schedule_node *mark_outer_permutable( + __isl_take isl_schedule_node *node, void *user) +{ + struct gpu_gen *gen = user; + int outer; + int scale; + int tile_len; + int *tile_size; + isl_id *id; + isl_multi_val *sizes; + + outer = is_outer_tilable(node); + if (outer < 0) + return isl_schedule_node_free(node); + if (!outer) + return node; + + if (gen->options->hybrid) { + isl_schedule_node *saved = isl_schedule_node_copy(node); + node = try_hybrid_tile(gen, node); + isl_schedule_node_free(saved); + if (node != saved) + return node; + } + + if (isl_schedule_node_get_type(node) != isl_schedule_node_band || + !isl_schedule_node_band_member_get_coincident(node, 0)) + node = insert_empty_permutable_band(node); + + tile_len = isl_schedule_node_band_n_member(node); + tile_size = read_tile_sizes(gen, &tile_len); + if (!tile_size) + return isl_schedule_node_free(node); + if (tile_len < isl_schedule_node_band_n_member(node)) + node = isl_schedule_node_band_split(node, tile_len); + sizes = construct_band_tiles_sizes(node, tile_size); + node = tile_band(node, isl_multi_val_copy(sizes)); + node = isl_schedule_node_child(node, 0); + if (gen->options->unroll_gpu_tile) + node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); + id = isl_id_alloc(gen->ctx, "thread", NULL); + node = isl_schedule_node_insert_mark(node, id); + node = isl_schedule_node_parent(node); + + scale = gen->options->scale_tile_loops; + node = gpu_create_kernel(gen, node, scale, sizes); + isl_multi_val_free(sizes); + free(tile_size); + + return node; +} + +/* Given a set or sequence node, return the union the filters of either all + * (if "only_initial" is not set) or the initial (if "only_initial" is set) + * direct subtrees that do not contain any suitably permutable bands + * (according to subtree_has_permutable_bands). + */ +static __isl_give isl_union_set *get_non_parallel_subtree_filters( + __isl_keep isl_schedule_node *node, int only_initial) +{ + isl_space *space; + isl_union_set *filter; + int i, n; + + n = isl_schedule_node_n_children(node); + if (n < 0) + return NULL; + + node = isl_schedule_node_copy(node); + node = isl_schedule_node_child(node, 0); + filter = isl_schedule_node_filter_get_filter(node); + node = isl_schedule_node_parent(node); + space = isl_union_set_get_space(filter); + isl_union_set_free(filter); + filter = isl_union_set_empty(space); + + for (i = 0; i < n; ++i) { + int parallelism; + + node = isl_schedule_node_child(node, i); + parallelism = subtree_has_permutable_bands(node); + if (parallelism < 0) { + filter = isl_union_set_free(filter); + } else if (!parallelism) { + isl_union_set *filter_i; + filter_i = isl_schedule_node_filter_get_filter(node); + filter = isl_union_set_union(filter, filter_i); + } else if (only_initial) + break; + node = isl_schedule_node_parent(node); + } + + isl_schedule_node_free(node); + + return filter; +} + +/* Given a set or sequence node, return the union of the filters of + * the direct subtrees that do not contain any suitably permutable bands + * (according to subtree_has_permutable_bands). + */ +static __isl_give isl_union_set *get_all_non_parallel_subtree_filters( + __isl_keep isl_schedule_node *node) +{ + return get_non_parallel_subtree_filters(node, 0); +} + +/* Given a set or sequence node, return the union of the filters of + * the initial direct subtrees that do not contain any suitably permutable + * bands (according to subtree_has_permutable_bands). 
+ */ +static __isl_give isl_union_set *get_initial_non_parallel_subtree_filters( + __isl_keep isl_schedule_node *node) +{ + return get_non_parallel_subtree_filters(node, 1); +} + +/* Mark all variables that are accessed by the statement instances in "domain" + * and that are local to "prog" as requiring a declaration in the host code. + * The statement instances in "domain" correspond to (a subset of) + * the active instances at "node". + * "node" is not modified by this function, except that NULL is returned + * in case of error. + */ +static __isl_give isl_schedule_node *declare_accessed_local_variables( + __isl_take isl_schedule_node *node, struct gpu_prog *prog, + __isl_keep isl_union_set *domain) +{ + isl_union_pw_multi_aff *contraction; + isl_union_set *arrays; + int i; + + if (!ppcg_scop_any_hidden_declarations(prog->scop)) + return node; + contraction = isl_schedule_node_get_subtree_contraction(node); + domain = isl_union_set_copy(domain); + domain = isl_union_set_preimage_union_pw_multi_aff(domain, contraction); + arrays = accessed_by_domain(domain, prog); + + for (i = 0; i < prog->n_array; ++i) { + isl_space *space; + isl_set *set; + int empty; + + if (!prog->array[i].local) + continue; + space = isl_set_get_space(prog->array[i].extent); + set = isl_union_set_extract_set(arrays, space); + empty = isl_set_plain_is_empty(set); + isl_set_free(set); + if (empty < 0) + goto error; + if (!empty) + prog->array[i].declare_local = 1; + } + + isl_union_set_free(arrays); + return node; +error: + isl_union_set_free(arrays); + return isl_schedule_node_free(node); +} + +/* If "node" points to a set node, then separate its children + * into subtrees that have suitably permutable bands and + * those that do not. + * Adjust the schedule tree in order to execute the second group + * after the first group and return a pointer to the first group, + * assuming there are any such subtrees. + * If "node" points to a sequence node, then separate the initial + * children that do not have suitably permutable bands and + * return a pointer to the subsequence of children that do have such bands, + * assuming there are any such subtrees. + * + * In both cases, mark all local variables in "prog" that are accessed by + * the group without permutable bands as requiring a declaration on the host. + */ +static __isl_give isl_schedule_node *isolate_permutable_subtrees( + __isl_take isl_schedule_node *node, struct gpu_prog *prog) +{ + isl_union_set *filter; + enum isl_schedule_node_type type; + + if (!node) + return NULL; + type = isl_schedule_node_get_type(node); + if (type == isl_schedule_node_set) { + filter = get_all_non_parallel_subtree_filters(node); + node = declare_accessed_local_variables(node, prog, filter); + node = isl_schedule_node_order_after(node, filter); + } else if (type == isl_schedule_node_sequence) { + filter = get_initial_non_parallel_subtree_filters(node); + node = declare_accessed_local_variables(node, prog, filter); + node = isl_schedule_node_order_before(node, filter); + } + + return node; +} + +/* Replace any reference to an array element in the range of "copy" + * by a reference to all array elements (defined by the extent of the array). 
+ */ +static __isl_give isl_union_map *approximate_copy_out( + __isl_take isl_union_map *copy, struct gpu_prog *prog) +{ + int i; + isl_union_map *res; + + res = isl_union_map_empty(isl_union_map_get_space(copy)); + + for (i = 0; i < prog->n_array; ++i) { + isl_space *space; + isl_set *set; + isl_union_map *copy_i; + isl_union_set *extent, *domain; + + space = isl_space_copy(prog->array[i].space); + extent = isl_union_set_from_set(isl_set_universe(space)); + copy_i = isl_union_map_copy(copy); + copy_i = isl_union_map_intersect_range(copy_i, extent); + set = isl_set_copy(prog->array[i].extent); + extent = isl_union_set_from_set(set); + domain = isl_union_map_domain(copy_i); + copy_i = isl_union_map_from_domain_and_range(domain, extent); + res = isl_union_map_union(res, copy_i); + } + + isl_union_map_free(copy); + + return res; +} + +/* Insert "kernel" marks that point to a ppcg_kernel structure + * in front of all outermost tilable band that (by construction) + * have at least one parallel loop. + */ +static __isl_give isl_schedule_node *mark_kernels(struct gpu_gen *gen, + __isl_take isl_schedule_node *node) +{ + return isl_schedule_node_map_descendant_bottom_up(node, + &mark_outer_permutable, gen); +} + +/* Construct schedule constraints from the dependences in prog->scop and + * the array order dependences in prog->array_order. + * + * If live range reordering is allowed, then we need to make sure + * that live ranges on arrays are not run in parallel since doing + * so would require array expansion. We therefore add the array + * order dependences to the coincidence dependences. Non-zero array + * order dependences will then prevent a schedule dimension from being + * considered parallel. + * Live ranges derived from scalars are allowed to be run in parallel + * since we force the scalars to be mapped to private memory in + * check_scalar_live_ranges. + * If live range reordering is allowed, then the false dependences + * are not added to the validity constraints as that would prevent + * reordering. Instead, the external false dependences that enforce that reads + * from potentially live-in data precede any later write and + * that writes of potentially live-out data follow any other earlier write + * are added to the validity and the coincidence constraints. + * The false dependences are still added to the proximity constraints + * for consistency with the case where live range reordering is not allowed. + * The coincidence constraints then consist of flow dependences, + * external false dependences and array order dependences. + * The independences can be filtered out from the first two sets. + * They have already been filtered out from the array order dependences + * on a per array basis in collect_order_dependences. + * There is no need for a per array handling of the other two sets + * as there should be no flow or external false dependence on local + * variables that can be filtered out. 
+ */ +static __isl_give isl_schedule_constraints *construct_schedule_constraints( + struct gpu_prog *prog) +{ + isl_union_set *domain; + isl_union_map *dep_raw, *dep; + isl_union_map *validity, *proximity, *coincidence; + isl_schedule_constraints *sc; + + domain = isl_union_set_copy(prog->scop->domain); + sc = isl_schedule_constraints_on_domain(domain); + sc = isl_schedule_constraints_set_context(sc, + isl_set_copy(prog->scop->context)); + if (prog->scop->options->live_range_reordering) { + sc = isl_schedule_constraints_set_conditional_validity(sc, + isl_union_map_copy(prog->scop->tagged_dep_flow), + isl_union_map_copy(prog->scop->tagged_dep_order)); + proximity = isl_union_map_copy(prog->scop->dep_flow); + validity = isl_union_map_copy(proximity); + validity = isl_union_map_union(validity, + isl_union_map_copy(prog->scop->dep_forced)); + proximity = isl_union_map_union(proximity, + isl_union_map_copy(prog->scop->dep_false)); + coincidence = isl_union_map_copy(validity); + coincidence = isl_union_map_subtract(coincidence, + isl_union_map_copy(prog->scop->independence)); + coincidence = isl_union_map_union(coincidence, + isl_union_map_copy(prog->array_order)); + } else { + dep_raw = isl_union_map_copy(prog->scop->dep_flow); + dep = isl_union_map_copy(prog->scop->dep_false); + dep = isl_union_map_union(dep, dep_raw); + dep = isl_union_map_coalesce(dep); + proximity = isl_union_map_copy(dep); + coincidence = isl_union_map_copy(dep); + validity = dep; + } + sc = isl_schedule_constraints_set_validity(sc, validity); + sc = isl_schedule_constraints_set_coincidence(sc, coincidence); + sc = isl_schedule_constraints_set_proximity(sc, proximity); + + if (prog->scop->options->debug->dump_schedule_constraints) + isl_schedule_constraints_dump(sc); + return sc; +} + +/* Compute an appropriate schedule based on the accesses in + * gen->read and gen->write. + * + * We derive schedule constraints from the dependences in gen->prog->scop + * and then use isl to compute a schedule that has a parallel loop + * in each tilable band. + * During the schedule construction, some statement instances + * may be grouped first based on the input schedule. + */ +static __isl_give isl_schedule *compute_schedule(struct gpu_gen *gen) +{ + isl_schedule_constraints *sc; + isl_schedule *schedule; + + sc = construct_schedule_constraints(gen->prog); + schedule = gen->prog->scop->schedule; + schedule = ppcg_compute_schedule(sc, schedule, gen->options); + + return schedule; +} + +/* If the band node "node" has exactly one member then mark it permutable. + */ +static __isl_give isl_schedule_node *band_set_permutable( + __isl_take isl_schedule_node *node, + __isl_keep isl_schedule_constraints *sc) +{ + if (isl_schedule_node_band_n_member(node) == 1) + node = isl_schedule_node_band_set_permutable(node, 1); + + return node; +} + +/* Return the coincidence constraints between pairs of instances + * that are scheduled together by the ancestors of "node". + * That is, select those coincidence constraints that relate + * pairs of instances that have the same value for the prefix schedule. + * If the schedule depth is zero, then the prefix schedule does not + * contain any information, so we intersect domain and range + * of the schedule constraints with the reaching domain elements instead. 
+ */ +static __isl_give isl_union_map *get_local_coincidence( + __isl_keep isl_schedule_node *node, + __isl_keep isl_schedule_constraints *sc) +{ + isl_union_map *coincidence; + isl_multi_union_pw_aff *prefix; + isl_union_pw_multi_aff *contraction; + + coincidence = isl_schedule_constraints_get_coincidence(sc); + contraction = isl_schedule_node_get_subtree_contraction(node); + if (isl_schedule_node_get_schedule_depth(node) == 0) { + isl_union_set *domain; + + domain = isl_schedule_node_get_domain(node); + domain = isl_union_set_preimage_union_pw_multi_aff(domain, + contraction); + coincidence = isl_union_map_intersect_domain(coincidence, + isl_union_set_copy(domain)); + coincidence = isl_union_map_intersect_range(coincidence, + domain); + return coincidence; + } + + prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); + prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix, + contraction); + return isl_union_map_eq_at_multi_union_pw_aff(coincidence, prefix); +} + +/* For each member in the band node "node", determine whether + * it is coincident with respect to the outer nodes and mark + * it accordingly. + * + * That is, for each coincidence constraint between pairs + * of instances that are scheduled together by the outer nodes, + * check that domain and range are assigned the same value + * by the band member. This test is performed by checking + * that imposing the same value for the band member does not + * remove any elements from the set of coincidence constraints. + */ +static __isl_give isl_schedule_node *band_set_coincident( + __isl_take isl_schedule_node *node, + __isl_keep isl_schedule_constraints *sc) +{ + isl_union_map *coincidence; + isl_union_pw_multi_aff *contraction; + isl_multi_union_pw_aff *partial; + int i, n; + + coincidence = get_local_coincidence(node, sc); + + partial = isl_schedule_node_band_get_partial_schedule(node); + contraction = isl_schedule_node_get_subtree_contraction(node); + partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial, + contraction); + n = isl_schedule_node_band_n_member(node); + for (i = 0; i < n; ++i) { + isl_union_map *coincidence_i; + isl_union_pw_aff *upa; + isl_multi_union_pw_aff *partial_i; + int subset; + + upa = isl_multi_union_pw_aff_get_union_pw_aff(partial, i); + partial_i = isl_multi_union_pw_aff_from_union_pw_aff(upa); + coincidence_i = isl_union_map_copy(coincidence); + coincidence_i = isl_union_map_eq_at_multi_union_pw_aff( + coincidence_i, partial_i); + subset = isl_union_map_is_subset(coincidence, coincidence_i); + isl_union_map_free(coincidence_i); + + if (subset < 0) + break; + node = isl_schedule_node_band_member_set_coincident(node, i, + subset); + } + if (i < n) + node = isl_schedule_node_free(node); + isl_multi_union_pw_aff_free(partial); + isl_union_map_free(coincidence); + + return node; +} + +/* If "node" is a band, then set its properties. + * + * In particular, if the band has exactly one member, then mark it permutable. + * Mark the band member coincident based on the coincidence constraints + * of "sc". 
+ */
+static __isl_give isl_schedule_node *set_band_properties(
+	__isl_take isl_schedule_node *node, void *user)
+{
+	isl_schedule_constraints *sc = user;
+
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
+		return node;
+	if (isl_schedule_node_band_n_member(node) == 0)
+		return node;
+
+	node = band_set_permutable(node, sc);
+	node = band_set_coincident(node, sc);
+
+	return node;
+}
+
+/* Return the original schedule with all bands marked permutable and
+ * all band members marked coincident based on the coincidence constraints.
+ * The bands are explicitly marked permutable so that they will be considered
+ * by mark_outer_permutable.
+ */
+static __isl_give isl_schedule *determine_properties_original_schedule(
+	struct gpu_gen *gen)
+{
+	isl_schedule *schedule;
+	isl_schedule_constraints *sc;
+
+	schedule = isl_schedule_copy(gen->prog->scop->schedule);
+	sc = construct_schedule_constraints(gen->prog);
+	schedule = isl_schedule_map_schedule_node_bottom_up(schedule,
+		&set_band_properties, sc);
+	isl_schedule_constraints_free(sc);
+
+	return schedule;
+}
+
+/* Compute a schedule or determine the properties of the original schedule
+ * depending on the value of the "reschedule" option.
+ */
+static __isl_give isl_schedule *compute_or_set_properties(void *user)
+{
+	struct gpu_gen *gen = user;
+
+	if (gen->options->reschedule)
+		return compute_schedule(gen);
+	else
+		return determine_properties_original_schedule(gen);
+}
+
+/* Obtain a schedule for the scop, by reading it from
+ * a file, by computing one or by determining the properties
+ * of the original schedule.
+ */
+__isl_give isl_schedule *get_schedule(struct gpu_gen *gen)
+{
+	return ppcg_get_schedule(gen->ctx, gen->options,
+		&compute_or_set_properties, gen);
+}
+
+/* Construct the string "<a>_<b>".
+ */
+static char *concat(isl_ctx *ctx, const char *a, const char *b)
+{
+	isl_printer *p;
+	char *s;
+
+	p = isl_printer_to_str(ctx);
+	p = isl_printer_print_str(p, a);
+	p = isl_printer_print_str(p, "_");
+	p = isl_printer_print_str(p, b);
+	s = isl_printer_get_str(p);
+	isl_printer_free(p);
+
+	return s;
+}
+
+/* For each array in "prog" of which an element appears in "accessed" and
+ * that is not a read only scalar, create a zero-dimensional universe set
+ * of which the tuple id has name "<prefix>_<name of array>" and a user
+ * pointer pointing to the array (gpu_array_info).
+ *
+ * If the array is local to "prog", then make sure it will be declared
+ * in the host code.
+ *
+ * Return the list of these universe sets.
+ */
+static __isl_give isl_union_set_list *create_copy_filters(struct gpu_prog *prog,
+	const char *prefix, __isl_take isl_union_set *accessed)
+{
+	int i;
+	isl_ctx *ctx;
+	isl_union_set_list *filters;
+
+	ctx = prog->ctx;
+	filters = isl_union_set_list_alloc(ctx, 0);
+	for (i = 0; i < prog->n_array; ++i) {
+		struct gpu_array_info *array = &prog->array[i];
+		isl_space *space;
+		isl_set *accessed_i;
+		int empty;
+		char *name;
+		isl_id *id;
+		isl_union_set *uset;
+
+		if (gpu_array_is_read_only_scalar(array))
+			continue;
+
+		space = isl_space_copy(array->space);
+		accessed_i = isl_union_set_extract_set(accessed, space);
+		empty = isl_set_plain_is_empty(accessed_i);
+		isl_set_free(accessed_i);
+		if (empty < 0) {
+			filters = isl_union_set_list_free(filters);
+			break;
+		}
+		if (empty)
+			continue;
+
+		array->global = 1;
+		if (array->local)
+			array->declare_local = 1;
+
+		name = concat(ctx, prefix, array->name);
+		id = name ? isl_id_alloc(ctx, name, array) : NULL;
+		free(name);
+		space = isl_space_set_alloc(ctx, 0, 0);
+		space = isl_space_set_tuple_id(space, isl_dim_set, id);
+		uset = isl_union_set_from_set(isl_set_universe(space));
+
+		filters = isl_union_set_list_add(filters, uset);
+	}
+	isl_union_set_free(accessed);
+
+	return filters;
+}
+
+/* Make sure that code for the statements in "filters" that
+ * copy arrays to or from the device is only generated when
+ * the size of the corresponding array is positive.
+ * That is, add a set node underneath "graft" with "filters" as children
+ * and for each child add a guard that selects the parameter
+ * values for which the corresponding array has a positive size.
+ * The array is available in the user pointer of the statement identifier.
+ * "depth" is the schedule depth of the position where "graft"
+ * will be added.
+ */
+static __isl_give isl_schedule_node *insert_positive_size_guards(
+	__isl_take isl_schedule_node *graft,
+	__isl_take isl_union_set_list *filters, int depth)
+{
+	int i, n;
+
+	graft = isl_schedule_node_child(graft, 0);
+	graft = isl_schedule_node_insert_set(graft, filters);
+	n = isl_schedule_node_n_children(graft);
+	for (i = 0; i < n; ++i) {
+		isl_union_set *filter;
+		isl_set *domain, *guard;
+		isl_id *id;
+		struct gpu_array_info *array;
+
+		graft = isl_schedule_node_child(graft, i);
+		filter = isl_schedule_node_filter_get_filter(graft);
+		domain = isl_set_from_union_set(filter);
+		id = isl_set_get_tuple_id(domain);
+		array = isl_id_get_user(id);
+		isl_id_free(id);
+		isl_set_free(domain);
+		guard = gpu_array_positive_size_guard(array);
+		guard = isl_set_from_params(guard);
+		guard = isl_set_add_dims(guard, isl_dim_set, depth);
+		graft = isl_schedule_node_child(graft, 0);
+		graft = isl_schedule_node_insert_guard(graft, guard);
+		graft = isl_schedule_node_parent(graft);
+		graft = isl_schedule_node_parent(graft);
+	}
+	graft = isl_schedule_node_parent(graft);
+
+	return graft;
+}
+
+/* Create a graft for copying arrays to or from the device,
+ * whenever the size of the array is strictly positive.
+ * Each statement is called "<prefix>_<name of array>" and
+ * the identifier has a user pointer pointing to the array.
+ * The graft will be added at the position specified by "node".
+ * "copy" contains the array elements that need to be copied.
+ * Only arrays of which some elements need to be copied
+ * will have a corresponding statement in the graft.
+ * Note though that each such statement will copy the entire array.
+ */
+static __isl_give isl_schedule_node *create_copy_device(struct gpu_prog *prog,
+	__isl_keep isl_schedule_node *node, const char *prefix,
+	__isl_take isl_union_set *copy)
+{
+	int depth;
+	isl_ctx *ctx;
+	isl_space *space;
+	isl_union_set *all, *domain;
+	isl_union_set_list *filters;
+	isl_union_map *extension;
+	isl_schedule_node *graft;
+
+	ctx = prog->ctx;
+	depth = isl_schedule_node_get_schedule_depth(node);
+	filters = create_copy_filters(prog, prefix, copy);
+	all = isl_union_set_list_union(isl_union_set_list_copy(filters));
+
+	space = depth < 0 ?
NULL : isl_space_set_alloc(ctx, 0, depth); + domain = isl_union_set_from_set(isl_set_universe(space)); + extension = isl_union_map_from_domain_and_range(domain, all); + graft = isl_schedule_node_from_extension(extension); + + if (!filters) + return isl_schedule_node_free(graft); + if (isl_union_set_list_n_union_set(filters) == 0) { + isl_union_set_list_free(filters); + return graft; + } + + return insert_positive_size_guards(graft, filters, depth); +} + +/* Return (the universe spaces of) the arrays that are declared + * inside the scop corresponding to "prog" and for which all + * potential writes inside the scop form a subset of "domain". + */ +static __isl_give isl_union_set *extract_local_accesses(struct gpu_prog *prog, + __isl_keep isl_union_set *domain) +{ + int i; + isl_union_set *local; + + local = isl_union_set_empty(isl_union_set_get_space(domain)); + + for (i = 0; i < prog->n_array; ++i) { + isl_set *set; + isl_union_map *to_outer; + isl_union_map *may_write; + isl_union_set *write_domain; + isl_union_set *fields; + int subset; + + if (!prog->array[i].local) + continue; + + set = isl_set_universe(isl_space_copy(prog->array[i].space)); + to_outer = isl_union_map_copy(prog->to_outer); + to_outer = isl_union_map_intersect_range(to_outer, + isl_union_set_from_set(isl_set_copy(set))); + fields = isl_union_map_domain(to_outer); + may_write = isl_union_map_copy(prog->may_write); + may_write = isl_union_map_intersect_range(may_write, fields); + write_domain = isl_union_map_domain(may_write); + subset = isl_union_set_is_subset(write_domain, domain); + isl_union_set_free(write_domain); + + if (subset < 0) { + isl_set_free(set); + return isl_union_set_free(local); + } else if (subset) { + local = isl_union_set_add_set(local, set); + } else { + isl_set_free(set); + } + } + + return local; +} + +/* Internal data structure for node_may_persist. + * + * "tagger" maps tagged iteration domains to the corresponding untagged + * iteration domain. + * + * "may_persist_flow" is the set of all tagged dataflow dependences + * with those dependences removed that either precede or follow + * the kernel launch in a sequence. + * "inner_band_flow" is the set of all tagged dataflow dependences + * that are local to a given iteration of the outer band nodes + * with respect to the current node. + * "local_flow" is equal to "inner_band_flow", except that the domain + * and the range have been intersected with intermediate filters + * on children of sets or sequences. + */ +struct ppcg_may_persist_data { + isl_union_pw_multi_aff *tagger; + + isl_union_map *local_flow; + isl_union_map *inner_band_flow; + isl_union_map *may_persist_flow; +}; + +/* Update the information in "data" based on the band ancestor "node". + * + * In particular, we restrict the dependences in data->local_flow + * to those dependence where the source and the sink occur in + * the same iteration of the given band node. + * We also update data->inner_band_flow to the new value of + * data->local_flow. 
+ */ +static int update_may_persist_at_band(__isl_keep isl_schedule_node *node, + struct ppcg_may_persist_data *data) +{ + isl_multi_union_pw_aff *partial; + isl_union_pw_multi_aff *contraction; + isl_union_map *flow; + + if (isl_schedule_node_band_n_member(node) == 0) + return 0; + + partial = isl_schedule_node_band_get_partial_schedule(node); + contraction = isl_schedule_node_get_subtree_contraction(node); + partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial, + contraction); + partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial, + isl_union_pw_multi_aff_copy(data->tagger)); + + flow = data->local_flow; + flow = isl_union_map_eq_at_multi_union_pw_aff(flow, partial); + data->local_flow = flow; + + isl_union_map_free(data->inner_band_flow); + data->inner_band_flow = isl_union_map_copy(data->local_flow); + + return 0; +} + +/* Given a set of local reaching domain elements "domain", + * expand them to the corresponding leaf domain elements using "contraction" + * and insert the array references tags using data->tagger. + */ +static __isl_give isl_union_set *expand_and_tag( + __isl_take isl_union_set *domain, + __isl_take isl_union_pw_multi_aff *contraction, + struct ppcg_may_persist_data *data) +{ + domain = isl_union_set_preimage_union_pw_multi_aff(domain, + contraction); + domain = isl_union_set_preimage_union_pw_multi_aff(domain, + isl_union_pw_multi_aff_copy(data->tagger)); + return domain; +} + +/* Given a filter node that is the child of a set or sequence node, + * restrict data->local_flow to refer only to those elements + * in the filter of the node. + * "contraction" maps the leaf domain elements of the schedule tree + * to the corresponding domain elements at (the parent of) "node". + */ +static int filter_flow(__isl_keep isl_schedule_node *node, + struct ppcg_may_persist_data *data, + __isl_take isl_union_pw_multi_aff *contraction) +{ + isl_union_set *filter; + isl_union_map *flow; + + flow = data->local_flow; + filter = isl_schedule_node_filter_get_filter(node); + filter = expand_and_tag(filter, contraction, data); + flow = isl_union_map_intersect_domain(flow, isl_union_set_copy(filter)); + flow = isl_union_map_intersect_range(flow, filter); + data->local_flow = flow; + + return 0; +} + +/* Given a filter node "node", collect the filters on all preceding siblings + * (which are also filter nodes), add them to "filters" and return the result. + */ +static __isl_give isl_union_set *add_previous_filters( + __isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node) +{ + isl_schedule_node *sibling; + + sibling = isl_schedule_node_copy(node); + while (sibling && isl_schedule_node_has_previous_sibling(sibling)) { + isl_union_set *filter; + + sibling = isl_schedule_node_previous_sibling(sibling); + filter = isl_schedule_node_filter_get_filter(sibling); + filters = isl_union_set_union(filters, filter); + } + isl_schedule_node_free(sibling); + if (!sibling) + return isl_union_set_free(filters); + + return filters; +} + +/* Given a filter node "node", collect the filters on all following siblings + * (which are also filter nodes), add them to "filters" and return the result. 
+ */ +static __isl_give isl_union_set *add_next_filters( + __isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node) +{ + isl_schedule_node *sibling; + + sibling = isl_schedule_node_copy(node); + while (sibling && isl_schedule_node_has_next_sibling(sibling)) { + isl_union_set *filter; + + sibling = isl_schedule_node_next_sibling(sibling); + filter = isl_schedule_node_filter_get_filter(sibling); + filters = isl_union_set_union(filters, filter); + } + isl_schedule_node_free(sibling); + if (!sibling) + return isl_union_set_free(filters); + + return filters; +} + +/* Remove those flow dependences from data->may_persist_flow + * that flow between elements of "domain" within the same iteration + * of all outer band nodes. + * "contraction" maps the leaf domain elements of the schedule tree + * to the corresponding elements "domain". + */ +static void remove_external_flow(struct ppcg_may_persist_data *data, + __isl_take isl_union_set *domain, + __isl_keep isl_union_pw_multi_aff *contraction) +{ + isl_union_map *flow; + + contraction = isl_union_pw_multi_aff_copy(contraction); + domain = expand_and_tag(domain, contraction, data); + flow = isl_union_map_copy(data->local_flow); + flow = isl_union_map_intersect_domain(flow, isl_union_set_copy(domain)); + flow = isl_union_map_intersect_range(flow, domain); + + data->may_persist_flow = isl_union_map_subtract(data->may_persist_flow, + flow); +} + +/* Update the information in "data" based on the filter ancestor "node". + * We only need to modify anything if the filter is the child + * of a set or sequence node. + * + * In the case of a sequence, we remove the dependences between + * statement instances that are both executed either before or + * after the subtree that will be mapped to a kernel, within + * the same iteration of outer bands. + * + * In both cases, we restrict data->local_flow to the current child. + */ +static int update_may_persist_at_filter(__isl_keep isl_schedule_node *node, + struct ppcg_may_persist_data *data) +{ + enum isl_schedule_node_type type; + isl_schedule_node *parent; + isl_space *space; + isl_union_pw_multi_aff *contraction; + isl_union_set *before, *after, *filter; + + type = isl_schedule_node_get_parent_type(node); + if (type != isl_schedule_node_sequence && type != isl_schedule_node_set) + return 0; + + parent = isl_schedule_node_copy(node); + parent = isl_schedule_node_parent(parent); + contraction = isl_schedule_node_get_subtree_contraction(parent); + isl_schedule_node_free(parent); + + if (type == isl_schedule_node_set) + return filter_flow(node, data, contraction); + + filter = isl_schedule_node_filter_get_filter(node); + space = isl_union_set_get_space(filter); + isl_union_set_free(filter); + before = isl_union_set_empty(space); + after = isl_union_set_copy(before); + before = add_previous_filters(before, node); + after = add_next_filters(after, node); + + remove_external_flow(data, before, contraction); + remove_external_flow(data, after, contraction); + + return filter_flow(node, data, contraction); +} + +/* Update the information in "data" based on the ancestor "node". 
+ */
+static isl_stat update_may_persist_at(__isl_keep isl_schedule_node *node,
+	void *user)
+{
+	struct ppcg_may_persist_data *data = user;
+
+	switch (isl_schedule_node_get_type(node)) {
+	case isl_schedule_node_error:
+		return isl_stat_error;
+	case isl_schedule_node_context:
+	case isl_schedule_node_domain:
+	case isl_schedule_node_expansion:
+	case isl_schedule_node_extension:
+	case isl_schedule_node_guard:
+	case isl_schedule_node_leaf:
+	case isl_schedule_node_mark:
+	case isl_schedule_node_sequence:
+	case isl_schedule_node_set:
+		break;
+	case isl_schedule_node_band:
+		if (update_may_persist_at_band(node, data) < 0)
+			return isl_stat_error;
+		break;
+	case isl_schedule_node_filter:
+		if (update_may_persist_at_filter(node, data) < 0)
+			return isl_stat_error;
+		break;
+	}
+
+	return isl_stat_ok;
+}
+
+/* Determine the set of array elements that may need to be preserved
+ * by a kernel constructed from the subtree at "node".
+ * This includes the set of array elements that may need to be preserved
+ * by the entire scop (prog->may_persist) and the elements for which
+ * there is a potential flow dependence that may cross a kernel launch.
+ *
+ * To determine the second set, we start from all flow dependences.
+ * From this set of dependences, we remove those that cannot possibly
+ * require data to be preserved by a kernel launch.
+ * In particular, we consider the following sets of dependences.
+ * - dependences of which the write occurs inside the kernel.
+ *   If the data is needed outside the kernel, then it will
+ *   be copied out immediately after the kernel launch, so there
+ *   is no need for any special care.
+ * - dependences of which the read occurs inside the kernel and the
+ *   corresponding write occurs inside the same iteration of the
+ *   outer band nodes. This means that the data is needed in
+ *   the first kernel launch after the write, which is already
+ *   taken care of by the standard copy-in. That is, the data
+ *   do not need to be preserved by any intermediate call to
+ *   the same kernel.
+ * - dependences of which the write and the read either both occur
+ *   before the kernel launch or both occur after the kernel launch,
+ *   within the same iteration of the outer band nodes with respect
+ *   to the sequence that determines the ordering of the dependence
+ *   and the kernel launch. Such flow dependences cannot cross
+ *   any kernel launch.
+ *
+ * For the remaining (tagged) dependences, we take the domain
+ * (i.e., the tagged writes) and apply the tagged access relation
+ * to obtain the accessed data elements.
+ * These are then combined with the elements that may need to be
+ * preserved by the entire scop.
+ */
+static __isl_give isl_union_set *node_may_persist(
+	__isl_keep isl_schedule_node *node, struct gpu_prog *prog)
+{
+	struct ppcg_may_persist_data data;
+	isl_union_pw_multi_aff *contraction;
+	isl_union_set *domain;
+	isl_union_set *persist;
+	isl_union_map *flow, *local_flow;
+
+	data.tagger = prog->scop->tagger;
+
+	flow = isl_union_map_copy(prog->scop->tagged_dep_flow);
+	data.local_flow = isl_union_map_copy(flow);
+	data.inner_band_flow = isl_union_map_copy(flow);
+	data.may_persist_flow = flow;
+	if (isl_schedule_node_foreach_ancestor_top_down(node,
+				&update_may_persist_at, &data) < 0)
+		data.may_persist_flow =
+			isl_union_map_free(data.may_persist_flow);
+	flow = data.may_persist_flow;
+	isl_union_map_free(data.local_flow);
+
+	domain = isl_schedule_node_get_domain(node);
+	contraction = isl_schedule_node_get_subtree_contraction(node);
+	domain = isl_union_set_preimage_union_pw_multi_aff(domain,
+				contraction);
+	domain = isl_union_set_preimage_union_pw_multi_aff(domain,
+				isl_union_pw_multi_aff_copy(data.tagger));
+	flow = isl_union_map_subtract_domain(flow, isl_union_set_copy(domain));
+	local_flow = data.inner_band_flow;
+	local_flow = isl_union_map_intersect_range(local_flow, domain);
+	flow = isl_union_map_subtract(flow, local_flow);
+
+	persist = isl_union_map_domain(flow);
+	persist = isl_union_set_apply(persist,
+		isl_union_map_copy(prog->scop->tagged_may_writes));
+	persist = isl_union_set_union(persist,
+		isl_union_set_copy(prog->may_persist));
+
+	return persist;
+}
+
+/* Add nodes for copying outer arrays in and out of the device
+ * before and after the subtree "node", which contains one or more kernels.
+ * "domain" contains the original statement instances, i.e.,
+ * those that correspond to the domains of the access relations in "prog".
+ * In particular, the domain has not been contracted in any way.
+ * "prefix" contains the prefix schedule at that point, in terms
+ * of the same original statement instances.
+ *
+ * We first compute the sets of outer array elements that need
+ * to be copied in and out and then graft in the nodes for
+ * performing this copying.
+ *
+ * In particular, for each array that is possibly written anywhere in
+ * the subtree "node" and that may be used after "node"
+ * or that may be visible outside the corresponding scop,
+ * we copy out its entire extent.
+ *
+ * Any array element that is read without first being written inside
+ * the subtree "node" needs to be copied in.
+ * Furthermore, if there are any array elements that
+ * are copied out, but that may not be written inside "node", then
+ * they also need to be copied in to ensure that the value after execution
+ * is the same as the value before execution, at least for those array
+ * elements that may have their values preserved by the scop or that
+ * may be written before "node" and read after "node".
+ * In case the array elements are structures, we need to take into
+ * account that all members of the structures need to be written
+ * by "node" before we can avoid copying the data structure in.
+ *
+ * Note that the may_write relation is intersected with the domain,
+ * which has been intersected with the context.
+ * This helps in those cases where the arrays are declared with a fixed size,
+ * while the accesses are parametric and the context assigns a fixed value
+ * to the parameters.
+ *
+ * If an element from a local array is read without first being written,
+ * then there is no point in copying it in since it cannot have been
+ * written prior to the scop.
Warn about the uninitialized read instead. + */ +static __isl_give isl_schedule_node *add_to_from_device( + __isl_take isl_schedule_node *node, __isl_take isl_union_set *domain, + __isl_take isl_union_map *prefix, struct gpu_prog *prog) +{ + isl_union_set *local; + isl_union_set *may_persist; + isl_union_map *may_write, *must_write, *copy_out, *not_written; + isl_union_map *read, *copy_in; + isl_union_map *tagged; + isl_union_map *local_uninitialized; + isl_schedule_node *graft; + + tagged = isl_union_map_copy(prog->scop->tagged_reads); + tagged = isl_union_map_union(tagged, + isl_union_map_copy(prog->scop->tagged_may_writes)); + + may_write = isl_union_map_copy(prog->may_write); + may_write = isl_union_map_intersect_domain(may_write, + isl_union_set_copy(domain)); + may_write = remove_local_accesses(prog, + isl_union_map_copy(tagged), may_write, + isl_union_map_copy(prefix), 0); + may_write = isl_union_map_apply_range(may_write, + isl_union_map_copy(prog->to_outer)); + may_write = isl_union_map_apply_domain(may_write, + isl_union_map_copy(prefix)); + may_write = approximate_copy_out(may_write, prog); + copy_out = isl_union_map_copy(may_write); + may_write = isl_union_map_apply_range(may_write, + isl_union_map_copy(prog->to_inner)); + must_write = isl_union_map_copy(prog->must_write); + must_write = isl_union_map_apply_domain(must_write, + isl_union_map_copy(prefix)); + may_persist = node_may_persist(node, prog); + may_write = isl_union_map_intersect_range(may_write, may_persist); + not_written = isl_union_map_subtract(may_write, must_write); + + local = extract_local_accesses(prog, domain); + read = isl_union_map_copy(prog->read); + read = isl_union_map_intersect_domain(read, domain); + read = remove_local_accesses(prog, tagged, read, + isl_union_map_copy(prefix), 1); + local = isl_union_set_apply(local, isl_union_map_copy(prog->to_inner)); + local_uninitialized = isl_union_map_copy(prog->scop->live_in); + local_uninitialized = isl_union_map_intersect_range(local_uninitialized, + local); + local_uninitialized = isl_union_map_intersect(local_uninitialized, + isl_union_map_copy(read)); + if (!isl_union_map_is_empty(local_uninitialized)) { + fprintf(stderr, + "possibly uninitialized reads (not copied in):\n"); + isl_union_map_dump(local_uninitialized); + } + read = isl_union_map_subtract(read, local_uninitialized); + read = isl_union_map_apply_domain(read, prefix); + copy_in = isl_union_map_union(read, not_written); + copy_in = isl_union_map_apply_range(copy_in, + isl_union_map_copy(prog->to_outer)); + + graft = create_copy_device(prog, node, "to_device", + isl_union_map_range(copy_in)); + node = isl_schedule_node_graft_before(node, graft); + graft = create_copy_device(prog, node, "from_device", + isl_union_map_range(copy_out)); + node = isl_schedule_node_graft_after(node, graft); + + return node; +} + +/* Add nodes for initializing ("init_device") and clearing ("clear_device") + * the device before and after "node". 
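+ * For example, the two grafted nodes have the zero-dimensional universe
+ * domains { init_device[] } and { clear_device[] }, respectively.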
+ */ +static __isl_give isl_schedule_node *add_init_clear_device( + __isl_take isl_schedule_node *node) +{ + isl_ctx *ctx; + isl_space *space; + isl_union_set *domain; + isl_schedule_node *graft; + + ctx = isl_schedule_node_get_ctx(node); + + space = isl_space_set_alloc(ctx, 0, 0); + space = isl_space_set_tuple_name(space, isl_dim_set, "init_device"); + domain = isl_union_set_from_set(isl_set_universe(space)); + graft = isl_schedule_node_from_domain(domain); + + node = isl_schedule_node_graft_before(node, graft); + + space = isl_space_set_alloc(ctx, 0, 0); + space = isl_space_set_tuple_name(space, isl_dim_set, "clear_device"); + domain = isl_union_set_from_set(isl_set_universe(space)); + graft = isl_schedule_node_from_domain(domain); + + node = isl_schedule_node_graft_after(node, graft); + + return node; +} + +/* Update "schedule" for mapping to a GPU device. + * + * In particular, insert a context node, create kernels for + * each outermost tilable band and introduce nodes for copying arrays + * in and out of the device and for initializing and clearing the device. + * If the child of the initial root points to a set node, + * then children of this node that do not contain any tilable bands + * are separated from the other children and are not mapped to + * the device. + * + * The GPU code is generated in a context where at least one + * statement instance is executed. The corresponding guard is inserted + * around the entire schedule. + */ +__isl_give isl_schedule *map_to_device(struct gpu_gen *gen, + __isl_take isl_schedule *schedule, int to_from_device) +{ + isl_schedule_node *node; + isl_set *context; + isl_set *guard; + isl_union_set *domain; + isl_union_map *prefix; + isl_union_pw_multi_aff *contraction; + struct gpu_prog *prog; + + context = isl_set_copy(gen->prog->context); + context = isl_set_from_params(context); + schedule = isl_schedule_insert_context(schedule, context); + + prog = gen->prog; + guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain)); + prog->context = isl_set_intersect(prog->context, isl_set_copy(guard)); + guard = isl_set_from_params(guard); + + node = isl_schedule_get_root(schedule); + isl_schedule_free(schedule); + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); + node = isolate_permutable_subtrees(node, gen->prog); + domain = isl_schedule_node_get_domain(node); + contraction = isl_schedule_node_get_subtree_contraction(node); + domain = isl_union_set_preimage_union_pw_multi_aff(domain, + isl_union_pw_multi_aff_copy(contraction)); + prefix = isl_schedule_node_get_prefix_schedule_union_map(node); + prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, + contraction); + node = mark_kernels(gen, node); + if (to_from_device) { + node = add_to_from_device(node, domain, prefix, gen->prog); + } else { + isl_union_set_free(domain); + isl_union_map_free(prefix); + } + node = isl_schedule_node_root(node); + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_insert_guard(node, guard); + node = isl_schedule_node_child(node, 0); + node = add_init_clear_device(node); + schedule = isl_schedule_node_get_schedule(node); + isl_schedule_node_free(node); + + return schedule; +} + +/* Internal data structure for extract_access. + * "next_access" points to the end of a linked list that is extended + * by extract_access. + * "single_expression" is set if the access expressions belong to + * an expression statement (i.e., a statement without internal control). 
+ * "any_to_outer" maps all intermediate arrays to their outer arrays. + */ +struct ppcg_extract_access_data { + struct gpu_stmt_access **next_access; + int single_expression; + isl_union_map *any_to_outer; +}; + +/* Given a tagged access relation to a single array "tagged", extract it + * as a map, taking into account that the input may be empty. + * If the access relation is empty, then it does not contain + * any space information, so we try to recover it from the index + * expression. + * The space of the index expression is of the form I -> A, + * with I the statement instances and A the array, or [I -> F] -> A, + * with F the filters corresponding to arguments. + * We first drop F, if present, obtaining I -> A. + * Then we construct I -> R, with R the reference tag, + * combine the two into I -> [R -> A] and uncurry to obtain + * the final result [I -> R] -> A. + * Note that the index expression may have a lower dimension + * than that of the array, but this dimension is not used + * if the access relation is empty. + */ +static __isl_give isl_map *extract_single_tagged_access( + __isl_take isl_union_map *tagged, __isl_keep pet_expr *expr) +{ + int empty; + isl_id *id; + isl_space *space, *space2; + isl_multi_pw_aff *index; + + empty = isl_union_map_is_empty(tagged); + if (empty < 0) + goto error; + if (!empty) + return isl_map_from_union_map(tagged); + isl_union_map_free(tagged); + + index = pet_expr_access_get_index(expr); + space = isl_multi_pw_aff_get_space(index); + isl_multi_pw_aff_free(index); + if (isl_space_domain_is_wrapping(space)) + space = isl_space_domain_factor_domain(space); + space2 = isl_space_copy(space); + space2 = isl_space_from_domain(isl_space_domain(space)); + id = pet_expr_access_get_ref_id(expr); + space2 = isl_space_set_tuple_id(space2, isl_dim_out, id); + space = isl_space_range_product(space2, space); + space = isl_space_uncurry(space); + + return isl_map_empty(space); +error: + isl_union_map_free(tagged); + return NULL; +} + +/* Does the index expression "index" of "expr" represent an access + * to a single element? + * That is, is "index" completely specified? + * + * If "expr" accesses elements from different spaces (i.e., fields + * of a structure), then it does not access a single element. + * Otherwise, if the single space of the access matches the space + * of "index", then the index expression is completely specified + * (no pointer to a lower-dimensional slice of the accessed array) + * and a single element is being accessed. + */ +static isl_bool complete_index(__isl_keep pet_expr *expr, + __isl_keep isl_multi_pw_aff *index) +{ + isl_union_map *read, *write, *all; + isl_map *map; + isl_space *space1, *space2; + isl_bool complete; + + read = pet_expr_access_get_may_read(expr); + write = pet_expr_access_get_may_write(expr); + all = isl_union_map_union(read, write); + if (!all) + return isl_bool_error; + if (isl_union_map_n_map(all) != 1) { + isl_union_map_free(all); + return isl_bool_false; + } + map = isl_map_from_union_map(all); + space1 = isl_map_get_space(map); + isl_map_free(map); + space2 = isl_multi_pw_aff_get_space(index); + complete = isl_space_tuple_is_equal(space1, isl_dim_out, + space2, isl_dim_out); + isl_space_free(space1); + isl_space_free(space2); + + return complete; +} + +/* Does "expr" access a single, fixed element (independently of the statement + * instance)? + * That is, does it have a completely specified constant index expression? + * + * Note that it is not sufficient for the index expression to be + * piecewise constant. 
isl_multi_pw_aff_is_cst can therefore not be used.
+ */
+static isl_bool accesses_fixed_element(__isl_keep pet_expr *expr)
+{
+	int i, n;
+	isl_multi_pw_aff *index;
+	isl_bool fixed = isl_bool_true;
+
+	index = pet_expr_access_get_index(expr);
+	if (!index)
+		return isl_bool_error;
+	n = isl_multi_pw_aff_dim(index, isl_dim_out);
+	for (i = 0; i < n; ++i) {
+		isl_pw_aff *pa;
+
+		pa = isl_multi_pw_aff_get_pw_aff(index, i);
+		fixed = isl_pw_aff_n_piece(pa) == 1;
+		if (fixed)
+			fixed = isl_pw_aff_is_cst(pa);
+		isl_pw_aff_free(pa);
+		if (fixed < 0 || !fixed)
+			break;
+	}
+	if (fixed >= 0 && fixed)
+		fixed = complete_index(expr, index);
+	isl_multi_pw_aff_free(index);
+
+	return fixed;
+}
+
+/* Extract a gpu_stmt_access from "expr", append it to the list
+ * that ends in *data->next_access and update the end of the list.
+ * If the access expression performs a write, then it is considered
+ * exact only if it appears in a single expression statement and
+ * if its may access relation is equal to its must access relation.
+ *
+ * The combined set of may accesses may be a union if member accesses
+ * are involved, but the entire set is derived from a single reference and
+ * therefore from a single index expression. These accesses therefore
+ * all map to the same outer array.
+ */
+static int extract_access(__isl_keep pet_expr *expr, void *user)
+{
+	struct ppcg_extract_access_data *data = user;
+	isl_union_map *tagged;
+	struct gpu_stmt_access *access;
+	isl_ctx *ctx = pet_expr_get_ctx(expr);
+	isl_multi_pw_aff *index;
+
+	access = isl_alloc_type(ctx, struct gpu_stmt_access);
+	assert(access);
+	access->next = NULL;
+	access->read = pet_expr_access_is_read(expr);
+	access->write = pet_expr_access_is_write(expr);
+	tagged = pet_expr_access_get_tagged_may_read(expr);
+	tagged = isl_union_map_union(tagged,
+		pet_expr_access_get_tagged_may_write(expr));
+	tagged = isl_union_map_apply_range(tagged,
+		isl_union_map_copy(data->any_to_outer));
+	if (!access->write) {
+		access->exact_write = 1;
+	} else if (!data->single_expression) {
+		access->exact_write = 0;
+	} else {
+		isl_union_map *must, *may;
+		may = isl_union_map_copy(tagged);
+		may = isl_union_map_domain_factor_domain(may);
+		must = pet_expr_access_get_must_write(expr);
+		access->exact_write = isl_union_map_is_equal(must, may);
+		isl_union_map_free(must);
+		isl_union_map_free(may);
+	}
+	index = pet_expr_access_get_index(expr);
+	access->n_index = isl_multi_pw_aff_dim(index, isl_dim_out);
+	isl_multi_pw_aff_free(index);
+	access->ref_id = pet_expr_access_get_ref_id(expr);
+	access->tagged_access = extract_single_tagged_access(tagged, expr);
+	access->access = isl_map_copy(access->tagged_access);
+	access->access = isl_map_domain_factor_domain(access->access);
+	access->fixed_element = accesses_fixed_element(expr);
+
+	*data->next_access = access;
+	data->next_access = &(*data->next_access)->next;
+
+	if (!access->access || access->fixed_element < 0)
+		return -1;
+
+	return 0;
+}
+
+/* Construct a linked list of gpu_stmt_access objects,
+ * one for each access expression in the statement body.
+ * "any_to_outer" maps all intermediate arrays to their outer arrays.
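+ * For example, a statement body of the form A[i] = B[i] + B[i + 1]
+ * gives rise to three gpu_stmt_access objects: one for the write to A
+ * and one for each of the two reads from B.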
+ */ +static int pet_stmt_extract_accesses(struct gpu_stmt *stmt, + __isl_keep isl_union_map *any_to_outer) +{ + struct ppcg_extract_access_data data; + + stmt->accesses = NULL; + data.next_access = &stmt->accesses; + data.single_expression = + pet_tree_get_type(stmt->stmt->body) == pet_tree_expr; + data.any_to_outer = any_to_outer; + return pet_tree_foreach_access_expr(stmt->stmt->body, + &extract_access, &data); +} + +/* Has statement "stmt" been killed from "scop"? + * That is, is the instance set of "scop" free from any + * instances of "stmt"? + */ +static isl_bool is_stmt_killed(struct ppcg_scop *scop, struct pet_stmt *stmt) +{ + isl_space *space; + isl_set *left; + isl_bool empty; + + if (!scop || !stmt) + return isl_bool_error; + space = isl_set_get_space(stmt->domain); + left = isl_union_set_extract_set(scop->domain, space); + empty = isl_set_plain_is_empty(left); + isl_set_free(left); + + return empty; +} + +/* Return an array of gpu_stmt representing the statements in "scop". + * Do not collect array accesses for statements that have been killed. + */ +static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop, + __isl_keep isl_union_map *any_to_outer) +{ + int i; + struct gpu_stmt *stmts; + + stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->pet->n_stmt); + if (!stmts) + return NULL; + + for (i = 0; i < scop->pet->n_stmt; ++i) { + struct gpu_stmt *s = &stmts[i]; + isl_bool killed; + + s->id = isl_set_get_tuple_id(scop->pet->stmts[i]->domain); + s->stmt = scop->pet->stmts[i]; + killed = is_stmt_killed(scop, scop->pet->stmts[i]); + if (killed < 0) + return free_stmts(stmts, i + 1); + if (killed) + continue; + if (pet_stmt_extract_accesses(s, any_to_outer) < 0) + return free_stmts(stmts, i + 1); + } + + return stmts; +} + +/* Generate CUDA code for "scop" and print it to "p". + * After generating an AST for the transformed scop as explained below, + * we call "gen->print" to print the AST in the desired output format + * to "p". + * + * If it turns out that it does not make sense to generate GPU code, + * then we generate CPU code instead. + * + * The declarations of the arrays that are visible outside of the scop + * are printed outside of the code generated from the schedule, + * because the generated code may involve a guard around the entire code. + * + * We first compute a schedule that respects the dependences + * of the original program and select the outermost bands + * of tilable dimensions that have at least one parallel loop. + * If the --load-schedule is specified, then the loaded schedule + * is used instead of a computed schedule. + * + * Each of these bands B is then tiled according to "tile" sizes, resulting + * in two nested bands, with a kernel marker on top + * + * K + * | + * T + * | + * P + * + * We then split off at most 2 parallel dimensions from the T band and + * at most 3 parallel dimension from the P band + * + * K + * | + * T + * T1 + * | + * T2 + * | + * P1 + * | + * P2 + * + * A filter is introduced in front of T1 that maps the domain instances + * to block identifiers. Similarly, a filter is introduced in front of P1 + * that maps the domain instances to thread identifiers. + * + * For each iteration of the T2 band and for each array, we compute + * the array elements accessed by that iteration, construct a rectangular + * box around it and shift it to the origin. The result is used + * as shared memory for the array. + * + * Copying and synchronization statements are added to this schedule tree. 
+ * In principle, these are added in front of the P1 band, but some of + * them may get hoisted up to higher levels. + * + * The entire AST is then generated from the single resulting schedule tree. + * During the generation the subtrees at kernel nodes (K) are saved + * aside and replaced by kernel calls. The result is printed as host code + * while the saved subtrees are printed as device code. + */ +static __isl_give isl_printer *generate(__isl_take isl_printer *p, + struct gpu_gen *gen, struct ppcg_scop *scop, + struct ppcg_options *options) +{ + struct gpu_prog *prog; + isl_ctx *ctx; + isl_schedule *schedule; + int any_permutable; + + if (!scop) + return isl_printer_free(p); + + ctx = isl_printer_get_ctx(p); + prog = gpu_prog_alloc(ctx, scop); + if (!prog) + return isl_printer_free(p); + + gen->prog = prog; + schedule = get_schedule(gen); + + any_permutable = has_any_permutable_node(schedule); + if (any_permutable < 0 || !any_permutable) { + if (any_permutable < 0) + p = isl_printer_free(p); + else + p = print_cpu(p, scop, options); + isl_schedule_free(schedule); + } else { + const int create_to_from_device = 1; + schedule = map_to_device(gen, schedule, create_to_from_device); + gen->tree = generate_code(gen, schedule); + p = ppcg_set_macro_names(p); + p = ppcg_print_exposed_declarations(p, prog->scop); + p = gen->print(p, gen->prog, gen->tree, &gen->types, + gen->print_user); + isl_ast_node_free(gen->tree); + } + + gpu_prog_free(prog); + + return p; +} + +/* Wrapper around generate for use as a ppcg_transform callback. + */ +static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p, + struct ppcg_scop *scop, void *user) +{ + struct gpu_gen *gen = user; + + return generate(p, gen, scop, gen->options); +} + +/* Transform the code in the file called "input" by replacing + * all scops by corresponding GPU code and write the results to "out". + */ +int generate_gpu(isl_ctx *ctx, const char *input, FILE *out, + struct ppcg_options *options, + __isl_give isl_printer *(*print)(__isl_take isl_printer *p, + struct gpu_prog *prog, __isl_keep isl_ast_node *tree, + struct gpu_types *types, void *user), void *user) +{ + struct gpu_gen gen; + int r; + int i; + + gen.ctx = ctx; + gen.sizes = extract_sizes_from_str(ctx, options->sizes); + gen.options = options; + gen.kernel_id = 0; + gen.print = print; + gen.print_user = user; + gen.types.n = 0; + gen.types.name = NULL; + + if (options->debug->dump_sizes) { + isl_space *space = isl_space_params_alloc(ctx, 0); + gen.used_sizes = isl_union_map_empty(space); + } + + r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen); + + if (options->debug->dump_sizes) { + isl_union_map_dump(gen.used_sizes); + isl_union_map_free(gen.used_sizes); + } + + isl_union_map_free(gen.sizes); + for (i = 0; i < gen.types.n; ++i) + free(gen.types.name[i]); + free(gen.types.name); + + return r; +} + +/* Compute the set of inner array elements that may have their values + * preserved by "prog". In particular, collect the array elements of + * arrays that are not local to "prog" and remove those elements that + * are definitely killed or definitely written by "prog". 
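+ * For example, if a non-local array A is definitely written only at A[0],
+ * then A[0] is removed from the result while the remaining elements
+ * of A are kept.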
+ */ +__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog) +{ + int i; + isl_union_set *may_persist, *killed; + isl_union_map *must_kill; + + may_persist = isl_union_set_empty(isl_set_get_space(prog->context)); + for (i = 0; i < prog->n_array; ++i) { + isl_set *extent; + + if (prog->array[i].local) + continue; + + extent = isl_set_copy(prog->array[i].extent); + may_persist = isl_union_set_add_set(may_persist, extent); + } + + may_persist = isl_union_set_intersect_params(may_persist, + isl_set_copy(prog->context)); + may_persist = isl_union_set_apply(may_persist, + isl_union_map_copy(prog->to_inner)); + must_kill = isl_union_map_copy(prog->tagged_must_kill); + killed = isl_union_map_range(must_kill); + must_kill = isl_union_map_copy(prog->must_write); + killed = isl_union_set_union(killed, isl_union_map_range(must_kill)); + + may_persist = isl_union_set_subtract(may_persist, killed); + return may_persist; +} + +struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop) +{ + struct gpu_prog *prog; + isl_space *space; + isl_map *id; + + if (!scop) + return NULL; + + prog = isl_calloc_type(ctx, struct gpu_prog); + assert(prog); + + prog->ctx = ctx; + prog->scop = scop; + prog->context = isl_set_copy(scop->context); + prog->n_stmts = scop->pet->n_stmt; + prog->any_to_outer = pet_scop_compute_outer_to_any(scop->pet); + prog->any_to_outer = isl_union_map_reverse(prog->any_to_outer); + space = isl_union_map_get_space(prog->any_to_outer); + space = isl_space_set_from_params(space); + space = isl_space_add_dims(space, isl_dim_set, 1); + space = isl_space_map_from_set(space); + id = isl_map_identity(space); + prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer, id); + prog->stmts = extract_stmts(ctx, scop, prog->any_to_outer); + prog->read = isl_union_map_copy(scop->reads); + prog->may_write = isl_union_map_copy(scop->may_writes); + prog->must_write = isl_union_map_copy(scop->must_writes); + prog->tagged_must_kill = isl_union_map_copy(scop->tagged_must_kills); + prog->to_inner = pet_scop_compute_outer_to_inner(scop->pet); + prog->to_outer = isl_union_map_copy(prog->to_inner); + prog->to_outer = isl_union_map_reverse(prog->to_outer); + + if (!prog->stmts) + return gpu_prog_free(prog); + + if (collect_array_info(prog) < 0) + return gpu_prog_free(prog); + prog->may_persist = compute_may_persist(prog); + + return prog; +} + +void *gpu_prog_free(struct gpu_prog *prog) +{ + if (!prog) + return NULL; + free_array_info(prog); + free_stmts(prog->stmts, prog->n_stmts); + isl_union_map_free(prog->any_to_outer); + isl_union_map_free(prog->to_outer); + isl_union_map_free(prog->to_inner); + isl_union_map_free(prog->read); + isl_union_map_free(prog->may_write); + isl_union_map_free(prog->must_write); + isl_union_map_free(prog->tagged_must_kill); + isl_union_map_free(prog->array_order); + isl_union_set_free(prog->may_persist); + isl_set_free(prog->context); + free(prog); + return NULL; +} diff --git a/polly/lib/External/ppcg/gpu_array_tile.h b/polly/lib/External/ppcg/gpu_array_tile.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_array_tile.h @@ -0,0 +1,59 @@ +#ifndef GPU_ARRAY_TILE_H +#define GPU_ARRAY_TILE_H + +#include +#include +#include + +/* The fields stride and shift only contain valid information + * if shift != NULL. + * If so, they express that current index is such that if you add shift, + * then the result is always a multiple of stride. + * Let D represent the initial tile->depth dimensions of the computed schedule. 
+ * The spaces of "lb" and "shift" are of the form + * + * D -> [b] + */ +struct gpu_array_bound { + isl_val *size; + isl_aff *lb; + + isl_val *stride; + isl_aff *shift; +}; + +/* A tile of an outer array. + * + * requires_unroll is set if the schedule dimensions that are mapped + * to threads need to be unrolled for this (private) tile to be used. + * + * "depth" reflects the number of schedule dimensions that affect the tile. + * The copying into and/or out of the tile is performed at that depth. + * + * n is the dimension of the array. + * bound is an array of size "n" representing the lower bound + * and size for each index. + * + * tiling maps a tile in the global array to the corresponding + * shared/private memory tile and is of the form + * + * { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] } + * + * where D represents the initial "depth" dimensions + * of the computed schedule. + */ +struct gpu_array_tile { + isl_ctx *ctx; + int requires_unroll; + int depth; + int n; + struct gpu_array_bound *bound; + isl_multi_aff *tiling; +}; + +struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index); +struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile); + +__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile); + +#endif diff --git a/polly/lib/External/ppcg/gpu_array_tile.c b/polly/lib/External/ppcg/gpu_array_tile.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_array_tile.c @@ -0,0 +1,71 @@ +#include +#include + +#include "gpu_array_tile.h" + +struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile) +{ + int j; + + if (!tile) + return NULL; + + for (j = 0; j < tile->n; ++j) { + isl_val_free(tile->bound[j].size); + isl_val_free(tile->bound[j].stride); + isl_aff_free(tile->bound[j].lb); + isl_aff_free(tile->bound[j].shift); + } + free(tile->bound); + isl_multi_aff_free(tile->tiling); + free(tile); + + return NULL; +} + +/* Create a gpu_array_tile for an array of dimension "n_index". + */ +struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index) +{ + int i; + struct gpu_array_tile *tile; + + tile = isl_calloc_type(ctx, struct gpu_array_tile); + if (!tile) + return NULL; + + tile->ctx = ctx; + tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index); + if (!tile->bound) + return gpu_array_tile_free(tile); + + tile->n = n_index; + + for (i = 0; i < n_index; ++i) { + tile->bound[i].size = NULL; + tile->bound[i].lb = NULL; + tile->bound[i].stride = NULL; + tile->bound[i].shift = NULL; + } + + return tile; +} + +/* Compute the size of the tile specified by "tile" + * in number of elements and return the result. + */ +__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile) +{ + int i; + isl_val *size; + + if (!tile) + return NULL; + + size = isl_val_one(tile->ctx); + + for (i = 0; i < tile->n; ++i) + size = isl_val_mul(size, isl_val_copy(tile->bound[i].size)); + + return size; +} diff --git a/polly/lib/External/ppcg/gpu_group.h b/polly/lib/External/ppcg/gpu_group.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_group.h @@ -0,0 +1,65 @@ +#ifndef GPU_GROUP_H +#define GPU_GROUP_H + +#include +#include "gpu.h" + +/* A group of array references in a kernel that should be handled together. + * If private_tile is not NULL, then it is mapped to registers. + * Otherwise, if shared_tile is not NULL, it is mapped to shared memory. + * Otherwise, it is accessed from global memory. 
+ * Note that if both private_tile and shared_tile are set, then shared_tile + * is only used inside group_common_shared_memory_tile. + */ +struct gpu_array_ref_group { + /* The references in this group access this local array. */ + struct gpu_local_array_info *local_array; + /* This is the corresponding array. */ + struct gpu_array_info *array; + /* Position of this group in the list of reference groups of array. */ + int nr; + + /* The following fields are use during the construction of the groups. + * access is the combined access relation relative to the private + * memory tiling. In particular, the domain of the map corresponds + * to the first thread_depth dimensions of the kernel schedule. + * write is set if any access in the group is a write. + * exact_write is set if all writes are definite writes. + * slice is set if there is at least one access in the group + * that refers to more than one element + * "min_depth" is the minimum of the tile depths and thread_depth. + */ + isl_map *access; + int write; + int exact_write; + int slice; + int min_depth; + + /* The shared memory tile, NULL if none. */ + struct gpu_array_tile *shared_tile; + + /* The private memory tile, NULL if none. */ + struct gpu_array_tile *private_tile; + + /* References in this group; point to elements of a linked list. */ + int n_ref; + struct gpu_stmt_access **refs; +}; + +int gpu_group_references(struct ppcg_kernel *kernel, + __isl_keep isl_schedule_node *node); + +__isl_give isl_printer *gpu_array_ref_group_print_name( + struct gpu_array_ref_group *group, __isl_take isl_printer *p); +void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group); +__isl_give isl_union_map *gpu_array_ref_group_access_relation( + struct gpu_array_ref_group *group, int read, int write); +int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group); +enum ppcg_group_access_type gpu_array_ref_group_type( + struct gpu_array_ref_group *group); +struct gpu_array_tile *gpu_array_ref_group_tile( + struct gpu_array_ref_group *group); +struct gpu_array_ref_group *gpu_array_ref_group_free( + struct gpu_array_ref_group *group); + +#endif diff --git a/polly/lib/External/ppcg/gpu_group.c b/polly/lib/External/ppcg/gpu_group.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_group.c @@ -0,0 +1,1828 @@ +/* + * Copyright 2010-2011 INRIA Saclay + * Copyright 2012-2014 Ecole Normale Superieure + * Copyright 2015 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, + * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, + * 91893 Orsay, France + * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include +#include + +#include "gpu_array_tile.h" +#include "gpu_group.h" +#include "gpu_tree.h" +#include "schedule.h" + +/* Print the name of the local copy of a given group of array references. 
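+ * For example, the second of several groups of an array A that is mapped
+ * to shared memory is printed as "shared_A_1", while a group that is
+ * accessed directly from global memory is simply printed as "A".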
+ */ +__isl_give isl_printer *gpu_array_ref_group_print_name( + struct gpu_array_ref_group *group, __isl_take isl_printer *p) +{ + int global = 0; + enum ppcg_group_access_type type; + + type = gpu_array_ref_group_type(group); + if (type == ppcg_access_private) + p = isl_printer_print_str(p, "private_"); + else if (type == ppcg_access_shared) + p = isl_printer_print_str(p, "shared_"); + else + global = 1; + p = isl_printer_print_str(p, group->array->name); + if (!global && group->local_array->n_group > 1) { + p = isl_printer_print_str(p, "_"); + p = isl_printer_print_int(p, group->nr); + } + + return p; +} + +/* Return the union of all read (read = 1) and/or write (write = 1) + * access relations in the group. + */ +__isl_give isl_union_map *gpu_array_ref_group_access_relation( + struct gpu_array_ref_group *group, int read, int write) +{ + int i; + isl_union_map *access; + + access = isl_union_map_empty(isl_map_get_space(group->access)); + for (i = 0; i < group->n_ref; ++i) { + isl_map *map_i; + + if (!((read && group->refs[i]->read) || + (write && group->refs[i]->write))) + continue; + map_i = isl_map_copy(group->refs[i]->access); + access = isl_union_map_union(access, + isl_union_map_from_map(map_i)); + } + + return access; +} + +/* Should this array reference group be mapped to private, shared or global + * memory? + * If we have computed both a private and a shared tile, then + * the tile with the smallest depth is used. If both have the same depth, + * then the private tile is used. + */ +enum ppcg_group_access_type gpu_array_ref_group_type( + struct gpu_array_ref_group *group) +{ + if (group->private_tile && group->shared_tile && + group->shared_tile->depth < group->private_tile->depth) + return ppcg_access_shared; + if (group->private_tile) + return ppcg_access_private; + if (group->shared_tile) + return ppcg_access_shared; + return ppcg_access_global; +} + + +/* Return the effective gpu_array_tile associated to "group" or + * NULL if there is no such gpu_array_tile. + */ +struct gpu_array_tile *gpu_array_ref_group_tile( + struct gpu_array_ref_group *group) +{ + switch (gpu_array_ref_group_type(group)) { + case ppcg_access_global: + return NULL; + case ppcg_access_shared: + return group->shared_tile; + case ppcg_access_private: + return group->private_tile; + } +} + +/* Does the tile associated to "group" require unrolling of the schedule + * dimensions mapped to threads? + * Note that this can only happen for private tiles. + */ +int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group) +{ + struct gpu_array_tile *tile; + + tile = gpu_array_ref_group_tile(group); + if (!tile) + return 0; + return tile->requires_unroll; +} + +/* Given a constraint + * + * a(p,i) + j = g f(e) + * + * or -a(p,i) - j = g f(e) if sign < 0, + * store a(p,i) in bound->shift and g (stride) in bound->stride. + * a(p,i) is assumed to be an expression in only the parameters + * and the input dimensions. 
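+ * For example, from a constraint of the form i0 + j = 4 e0, with e0
+ * existentially quantified, a stride of 4 and a shift of i0 would be
+ * recorded.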
+ */ +static void extract_stride(__isl_keep isl_constraint *c, + struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign) +{ + int i; + isl_val *v; + isl_space *space; + unsigned nparam; + unsigned nvar; + isl_aff *aff; + + isl_val_free(bound->stride); + bound->stride = isl_val_copy(stride); + + space = isl_constraint_get_space(c); + space = isl_space_domain(space); + + nparam = isl_space_dim(space, isl_dim_param); + nvar = isl_space_dim(space, isl_dim_set); + + v = isl_constraint_get_constant_val(c); + if (sign < 0) + v = isl_val_neg(v); + aff = isl_aff_zero_on_domain(isl_local_space_from_space(space)); + aff = isl_aff_set_constant_val(aff, v); + + for (i = 0; i < nparam; ++i) { + if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1)) + continue; + v = isl_constraint_get_coefficient_val(c, isl_dim_param, i); + if (sign < 0) + v = isl_val_neg(v); + aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v); + } + + for (i = 0; i < nvar; ++i) { + if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1)) + continue; + v = isl_constraint_get_coefficient_val(c, isl_dim_in, i); + if (sign < 0) + v = isl_val_neg(v); + aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v); + } + + bound->shift = aff; +} + +/* Given an equality constraint of a map with a single output dimension j, + * check if the constraint is of the form + * + * a(p,i) + j = g f(e) + * + * with a(p,i) an expression in the parameters and input dimensions + * and f(e) an expression in the existentially quantified variables. + * If so, and if g is larger than any such g from a previously considered + * constraint, then call extract_stride to record the stride information + * in bound. + */ +static isl_stat check_stride_constraint(__isl_take isl_constraint *c, + void *user) +{ + int i; + isl_ctx *ctx; + isl_val *v; + unsigned n_div; + struct gpu_array_bound *bound = user; + + ctx = isl_constraint_get_ctx(c); + n_div = isl_constraint_dim(c, isl_dim_div); + v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0); + + if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) { + int s = isl_val_sgn(v); + isl_val *stride = isl_val_zero(ctx); + + isl_val_free(v); + for (i = 0; i < n_div; ++i) { + v = isl_constraint_get_coefficient_val(c, + isl_dim_div, i); + stride = isl_val_gcd(stride, v); + } + if (!isl_val_is_zero(stride) && + isl_val_gt(stride, bound->stride)) + extract_stride(c, bound, stride, s); + + isl_val_free(stride); + } else + isl_val_free(v); + + isl_constraint_free(c); + return isl_stat_ok; +} + +/* Given contraints on an array index i, check if we can find + * a shift a(p) and a stride g such that + * + * a(p) + i = 0 mod g + * + * If so, record the information in bound and apply the mapping + * i -> (i + a(p))/g to the array index in bounds and return + * the new constraints. + * If not, simply return the original constraints. + * + * If bounds is a subset of the space + * + * D -> i + * + * then the bound recorded in bound->shift is of the form + * + * D -> s(D) + * + * with s(D) equal to a(p) above. + * Next, we construct a mapping of the form + * + * [D -> i] -> [D -> (i + S(D))/g] + * + * This mapping is computed as follows. + * We first introduce "i" in the domain through precomposition + * with [D -> i] -> D obtaining + * + * [D -> i] -> s(D) + * + * Adding [D -> i] -> i produces + * + * [D -> i] -> i + s(D) + * + * and the domain product with [D -> i] -> D yields + * + * [D -> i] -> [D -> i + s(D)] + * + * Composition with [D -> i] -> [D -> i/g] gives the desired result. 
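+ * For example, with g equal to 4, the final composition maps
+ * an element [D -> i] to [D -> (i + s(D))/4].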
+ */ +static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound, + __isl_take isl_basic_map *bounds) +{ + isl_space *space; + isl_basic_map *hull; + isl_basic_map *shift, *id, *bmap, *scale; + isl_basic_set *bset; + isl_aff *aff; + + bound->stride = NULL; + + hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds)); + + isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound); + + isl_basic_map_free(hull); + + if (!bound->stride) + return bounds; + + shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift)); + space = isl_basic_map_get_space(bounds); + bmap = isl_basic_map_domain_map(isl_basic_map_universe(space)); + shift = isl_basic_map_apply_range(bmap, shift); + space = isl_basic_map_get_space(bounds); + id = isl_basic_map_range_map(isl_basic_map_universe(space)); + shift = isl_basic_map_sum(id, shift); + space = isl_basic_map_get_space(bounds); + id = isl_basic_map_domain_map(isl_basic_map_universe(space)); + shift = isl_basic_map_range_product(id, shift); + + space = isl_space_domain(isl_basic_map_get_space(bounds)); + id = isl_basic_map_identity(isl_space_map_from_set(space)); + space = isl_space_range(isl_basic_map_get_space(bounds)); + aff = isl_aff_zero_on_domain(isl_local_space_from_space(space)); + aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1); + aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride)); + scale = isl_basic_map_from_aff(aff); + scale = isl_basic_map_product(id, scale); + + bmap = isl_basic_map_apply_range(shift, scale); + bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap); + bounds = isl_basic_set_unwrap(bset); + + return bounds; +} + +/* Data used in compute_array_dim_size and compute_size_in_direction. + * + * pos is the position of the variable representing the array index, + * i.e., the variable for which want to compute the size. This variable + * is also the last variable in the set. + */ +struct gpu_size_info { + isl_basic_set *bset; + struct gpu_array_bound *bound; + int pos; +}; + +/* Given a constraint from the basic set describing the bounds on + * an array index, check if it is a lower bound, say m i >= b(x), and, + * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant + * upper bound. If so, and if this bound is smaller than any bound + * derived from earlier constraints, set the size to this bound on + * the expression and the lower bound to ceil(b(x)/m). 
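+ * For example, from constraints implying n <= i <= n + 9, a lower bound
+ * of n and a size of 10 would be derived.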
+ */ +static isl_stat compute_size_in_direction(__isl_take isl_constraint *c, + void *user) +{ + struct gpu_size_info *size = user; + unsigned nparam; + unsigned n_div; + isl_val *v; + isl_aff *aff; + isl_aff *lb; + + nparam = isl_basic_set_dim(size->bset, isl_dim_param); + n_div = isl_constraint_dim(c, isl_dim_div); + + if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) || + !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) { + isl_constraint_free(c); + return isl_stat_ok; + } + + aff = isl_constraint_get_bound(c, isl_dim_set, size->pos); + aff = isl_aff_ceil(aff); + + lb = isl_aff_copy(aff); + + aff = isl_aff_neg(aff); + aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1); + + v = isl_basic_set_max_val(size->bset, aff); + isl_aff_free(aff); + + if (isl_val_is_int(v)) { + v = isl_val_add_ui(v, 1); + if (!size->bound->size || isl_val_lt(v, size->bound->size)) { + isl_val_free(size->bound->size); + size->bound->size = isl_val_copy(v); + lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1); + isl_aff_free(size->bound->lb); + size->bound->lb = isl_aff_copy(lb); + } + } + isl_val_free(v); + isl_aff_free(lb); + + isl_constraint_free(c); + + return isl_stat_ok; +} + +/* Given a basic map "bounds" that maps parameters and input dimensions + * to a single output dimension, look for an expression in the parameters + * and input dimensions such that the range of the output dimension shifted + * by this expression is a constant. + * + * In particular, we currently only consider lower bounds on the output + * dimension as candidate expressions. + */ +static int compute_array_dim_size(struct gpu_array_bound *bound, + __isl_take isl_basic_map *bounds) +{ + struct gpu_size_info size; + + bounds = isl_basic_map_detect_equalities(bounds); + bounds = check_stride(bound, bounds); + + bound->size = NULL; + bound->lb = NULL; + + size.bound = bound; + size.pos = isl_basic_map_dim(bounds, isl_dim_in); + size.bset = isl_basic_map_wrap(bounds); + size.bset = isl_basic_set_flatten(size.bset); + size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset)); + isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction, + &size); + isl_basic_set_free(size.bset); + + return bound->size ? 0 : -1; +} + +/* Check if we can find a memory tile for the given array + * based on the given accesses, and if so, put the results in "tile". + * + * We project the accesses on each index in turn and look for a parametric + * offset such that the size is constant. + * + * tile->depth is initialized to the input dimension of the computed bounds. + */ +static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile) +{ + int i; + + tile->depth = isl_map_dim(access, isl_dim_in); + + for (i = 0; i < tile->n; ++i) { + isl_map *access_i; + isl_basic_map *hull; + + access_i = isl_map_copy(access); + access_i = isl_map_project_out(access_i, isl_dim_out, 0, i); + access_i = isl_map_project_out(access_i, isl_dim_out, + 1, tile->n - (i + 1)); + access_i = isl_map_compute_divs(access_i); + hull = isl_map_simple_hull(access_i); + if (compute_array_dim_size(&tile->bound[i], hull) < 0) + return 0; + } + + return 1; +} + +/* Internal data structure for gpu_group_references. + * + * scop represents the input scop. + * kernel_depth is the schedule depth where the kernel launch will + * be introduced, i.e., it is the depth of the band that is mapped + * to blocks. + * shared_depth is the schedule depth at which the copying to/from + * shared memory is computed. 
The copy operation may then + * later be hoisted to a higher level. + * thread_depth is the schedule depth where the thread mark is located, + * i.e., it is the depth of the band that is mapped to threads and also + * the schedule depth at which the copying to/from private memory + * is computed. The copy operation may then later be hoisted to + * a higher level. + * n_thread is the number of schedule dimensions in the band that + * is mapped to threads. + * privatization lives in the range of thread_sched (i.e., it is + * of dimension thread_depth + n_thread) and encodes the mapping + * to thread identifiers (as parameters). + * host_sched contains the kernel_depth dimensions of the host schedule. + * shared_sched contains the first shared_depth dimensions of the + * kernel schedule. + * copy_sched contains the first thread_depth dimensions of the + * kernel schedule. + * thread_sched contains the first (thread_depth + n_thread) dimensions + * of the kernel schedule. + * full_sched is a union_map representation of the entire kernel schedule. + * The schedules are all formulated in terms of the original statement + * instances, i.e., those that appear in the domains of the access + * relations. + */ +struct gpu_group_data { + struct ppcg_scop *scop; + int kernel_depth; + int shared_depth; + int thread_depth; + int n_thread; + isl_set *privatization; + isl_union_map *host_sched; + isl_union_map *shared_sched; + isl_union_map *copy_sched; + isl_union_map *thread_sched; + isl_union_map *full_sched; +}; + +/* Construct a map from domain_space to domain_space that increments + * the dimension at position "pos" and leaves all other dimensions + * constant. + */ +static __isl_give isl_map *next(__isl_take isl_space *domain_space, int pos) +{ + isl_space *space; + isl_aff *aff; + isl_multi_aff *next; + + space = isl_space_map_from_set(domain_space); + next = isl_multi_aff_identity(space); + aff = isl_multi_aff_get_aff(next, pos); + aff = isl_aff_add_constant_si(aff, 1); + next = isl_multi_aff_set_aff(next, pos, aff); + + return isl_map_from_multi_aff(next); +} + +/* Check if the given access is coalesced (or if there is no point + * in trying to coalesce the access by mapping the array to shared memory). + * That is, check whether incrementing the dimension that will get + * wrapped over the last thread index results in incrementing + * the last array index. + * + * If no two consecutive array elements are ever accessed by "access", + * then mapping the corresponding array to shared memory will not + * improve coalescing. In fact, the copying will likely be performed + * by a single thread. Consider the access as coalesced such that + * the caller will not try and map the array to shared memory just + * to improve coalescing. + * + * This function is only called for access relations without reuse and + * kernels with at least one thread identifier. 
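+ * For example, an access of the form { S[i, t] -> A[i, t] }, with t the
+ * dimension that will be wrapped over the last thread index, is coalesced,
+ * whereas the transposed access { S[i, t] -> A[t, i] } is not, since
+ * incrementing t then advances the first array index rather than the last.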
+ */ +static int access_is_coalesced(struct gpu_group_data *data, + __isl_keep isl_union_map *access) +{ + int dim; + isl_space *space; + isl_set *accessed; + isl_map *access_map; + isl_map *next_thread_x; + isl_map *next_element; + isl_map *map; + int coalesced, empty; + + access = isl_union_map_copy(access); + access = isl_union_map_apply_domain(access, + isl_union_map_copy(data->full_sched)); + access_map = isl_map_from_union_map(access); + + space = isl_map_get_space(access_map); + space = isl_space_range(space); + dim = isl_space_dim(space, isl_dim_set); + if (dim == 0) + next_element = isl_map_empty(isl_space_map_from_set(space)); + else + next_element = next(space, dim - 1); + + accessed = isl_map_range(isl_map_copy(access_map)); + map = isl_map_copy(next_element); + map = isl_map_intersect_domain(map, isl_set_copy(accessed)); + map = isl_map_intersect_range(map, accessed); + empty = isl_map_is_empty(map); + isl_map_free(map); + + if (empty < 0 || empty) { + isl_map_free(next_element); + isl_map_free(access_map); + return empty; + } + + space = isl_map_get_space(access_map); + space = isl_space_domain(space); + next_thread_x = next(space, data->thread_depth + data->n_thread - 1); + + map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map)); + map = isl_map_apply_range(map, access_map); + + coalesced = isl_map_is_subset(map, next_element); + + isl_map_free(next_element); + isl_map_free(map); + + return coalesced; +} + +/* Replace the host schedule dimensions in the access relation "access" + * by parameters, so that they are treated as fixed when checking for reuse + * (within a kernel) or whether two consecutive elements are accessed + * (within a kernel). + */ +static __isl_give isl_union_map *localize_access(struct gpu_group_data *data, + __isl_take isl_union_map *access) +{ + int n; + isl_space *space; + isl_set *param; + isl_union_map *umap; + isl_id_list *ids; + + umap = isl_union_map_copy(data->host_sched); + space = isl_union_map_get_space(umap); + n = data->kernel_depth; + ids = ppcg_scop_generate_names(data->scop, n, "__ppcg_host_"); + param = parametrization(space, n, 0, ids); + isl_id_list_free(ids); + umap = isl_union_map_intersect_range(umap, + isl_union_set_from_set(param)); + access = isl_union_map_intersect_domain(access, + isl_union_map_domain(umap)); + + return access; +} + +/* Given an access relation in terms of at least data->thread_depth initial + * dimensions of the computed schedule, check if it is bijective for + * fixed values of the first data->thread_depth dimensions. + * We perform this check by equating these dimensions to parameters. + */ +static int access_is_bijective(struct gpu_group_data *data, + __isl_keep isl_map *access) +{ + int res; + int dim; + isl_set *par; + isl_space *space; + isl_id_list *ids; + + access = isl_map_copy(access); + space = isl_space_params(isl_map_get_space(access)); + ids = ppcg_scop_generate_names(data->scop, data->thread_depth, "s"); + dim = isl_map_dim(access, isl_dim_in); + par = parametrization(space, dim, 0, ids); + isl_id_list_free(ids); + access = isl_map_intersect_domain(access, par); + res = isl_map_is_bijective(access); + isl_map_free(access); + + return res; +} + +/* Compute the number of outer schedule tile dimensions that affect + * the offset of "tile". + * If there is no such dimension, then return the index + * of the first kernel dimension, i.e., data->kernel_depth. 
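+ * For example, if the deepest schedule dimension that appears in any
+ * lower bound or shift of "tile" is the one at position
+ * data->kernel_depth + 1, then data->kernel_depth + 2 is returned.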
+ */ +static int compute_tile_depth(struct gpu_group_data *data, + struct gpu_array_tile *tile) +{ + int i, j; + + for (j = tile->depth - 1; j >= data->kernel_depth; --j) { + for (i = 0; i < tile->n; ++i) { + isl_aff *lb; + isl_aff *shift; + + lb = tile->bound[i].lb; + if (isl_aff_involves_dims(lb, isl_dim_in, j, 1)) + break; + + shift = tile->bound[i].shift; + if (!shift) + continue; + if (isl_aff_involves_dims(shift, isl_dim_in, j, 1)) + break; + } + if (i < tile->n) + break; + } + + return ++j; +} + +/* Return the lowest depth between data->kernel_depth and data->thread_depth + * at which every array element accessed through "acc" is accessed + * by a single thread. The input dimension of "acc" is + * data->thread_depth + data->n_thread, where the final data->n_thread + * dimensions are those that will be mapped to threads. + * If the values for these dimensions are uniquely determined + * by the array index and a given number of outer dimensions, then + * there is only one thread accessing that array element within those + * outer dimensions. + * + * The input space of "acc" is first split up, such that it has the form + * + * [O -> T] -> A + * + * with O the outer dimensions, T the dimensions that will be mapped to threads + * and A the array index. + * + * Then the positions of T and A are interchanged to simplify the test + * whether T uniquely depends on O and A. + * In particular, the above access relation is first combined with + * + * [O -> T] -> T + * + * to form + * + * [O -> T] -> [A -> T] + * + * from which + * + * O -> [A -> T] + * + * is extracted, which is then uncurried to + * + * [O -> A] -> T + * + * Finally, the final dimensions of O are projected out one by one + * until T is no longer uniquely determined by A and the remaining + * dimensions in O. The value returned is that of the last dimension + * that was successfully projected out. + * Note that there is no need to test whether [O -> A] -> T itself + * is single-valued as that was already tested in access_is_bijective. + */ +static int compute_accessed_by_single_thread_depth(struct gpu_group_data *data, + __isl_keep isl_map *acc) +{ + int i; + isl_space *space; + isl_map *map; + isl_bool sv; + + if (data->thread_depth == data->kernel_depth) + return data->thread_depth; + + acc = isl_map_copy(acc); + + space = isl_map_get_space(acc); + space = isl_space_params(space); + space = isl_space_set_from_params(space); + space = isl_space_add_dims(space, isl_dim_set, data->thread_depth); + space = isl_space_from_domain(space); + space = isl_space_add_dims(space, isl_dim_out, data->n_thread); + space = isl_space_wrap(space); + map = isl_set_flatten_map(isl_set_universe(space)); + acc = isl_map_apply_range(map, acc); + + space = isl_space_domain(isl_map_get_space(acc)); + map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space))); + acc = isl_map_range_product(acc, map); + acc = isl_map_domain_factor_domain(acc); + acc = isl_map_uncurry(acc); + + for (i = data->thread_depth - 1; i >= data->kernel_depth; --i) { + acc = isl_map_project_out(acc, isl_dim_in, i, 1); + sv = isl_map_is_single_valued(acc); + if (sv < 0) + return -1; + if (!sv) + break; + } + + isl_map_free(acc); + + return ++i; +} + +/* Adjust the fields of "tile" to reflect the new input dimension "depth". + * The dimension beyond "depth" are assumed not to affect the tile, + * so they can simply be dropped. 
+ */ +static int tile_adjust_depth(struct gpu_array_tile *tile, int depth) +{ + int i; + + if (tile->depth == depth) + return 0; + + for (i = 0; i < tile->n; ++i) { + tile->bound[i].lb = isl_aff_drop_dims(tile->bound[i].lb, + isl_dim_in, depth, tile->depth - depth); + if (!tile->bound[i].lb) + return -1; + if (!tile->bound[i].shift) + continue; + tile->bound[i].shift = isl_aff_drop_dims(tile->bound[i].shift, + isl_dim_in, depth, tile->depth - depth); + if (!tile->bound[i].shift) + return -1; + } + + tile->depth = depth; + + return 0; +} + +/* Determine the number of schedule dimensions that affect the offset of the + * shared or private tile "tile" and store the result in tile->depth, with + * a lower bound of data->kernel_depth. + * Also adjust the fields of the tile to only refer to the tile->depth + * outer schedule dimensions. + */ +static isl_stat tile_set_depth(struct gpu_group_data *data, + struct gpu_array_tile *tile) +{ + if (tile_adjust_depth(tile, compute_tile_depth(data, tile)) < 0) + return isl_stat_error; + + return isl_stat_ok; +} + +/* Determine the number of schedule dimensions that affect the offset of the + * shared tile and store the minimum of the private and shared tile depth + * in group->min_depth, with a lower bound of data->kernel_depth. + * If there is no tile defined on the array reference group, + * then set group->min_depth to data->thread_depth. + */ +static int set_depth(struct gpu_group_data *data, + struct gpu_array_ref_group *group) +{ + group->min_depth = data->thread_depth; + + if (group->private_tile) { + if (group->private_tile->depth < group->min_depth) + group->min_depth = group->private_tile->depth; + } + if (group->shared_tile) { + if (tile_set_depth(data, group->shared_tile) < 0) + return -1; + if (group->shared_tile->depth < group->min_depth) + group->min_depth = group->shared_tile->depth; + } + + return 0; +} + +/* Fill up the groups array with singleton groups, i.e., one group + * per reference, initializing the array, access, write, n_ref and refs fields. + * In particular the access field is initialized to the scheduled + * access relation of the array reference. + * + * Return the number of elements initialized, i.e., the number of + * active references in the current kernel. 
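+ *
+ * That is, the access relation of each reference is composed with
+ * data->copy_sched, and references whose scheduled access relation
+ * turns out to be empty (because they are not executed by the
+ * current kernel) do not give rise to a group.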
+ */ +static int populate_array_references(struct gpu_local_array_info *local, + struct gpu_array_ref_group **groups, struct gpu_group_data *data) +{ + int i; + int n; + isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched); + + n = 0; + for (i = 0; i < local->array->n_ref; ++i) { + isl_union_map *umap; + isl_map *map; + struct gpu_array_ref_group *group; + struct gpu_stmt_access *access = local->array->refs[i]; + + map = isl_map_copy(access->access); + umap = isl_union_map_from_map(map); + umap = isl_union_map_apply_domain(umap, + isl_union_map_copy(data->copy_sched)); + + if (isl_union_map_is_empty(umap)) { + isl_union_map_free(umap); + continue; + } + + map = isl_map_from_union_map(umap); + map = isl_map_detect_equalities(map); + + group = isl_calloc_type(ctx, struct gpu_array_ref_group); + if (!group) + return -1; + group->local_array = local; + group->array = local->array; + group->access = map; + group->write = access->write; + group->exact_write = access->exact_write; + group->slice = access->n_index < local->array->n_index; + group->refs = &local->array->refs[i]; + group->n_ref = 1; + + groups[n++] = group; + } + + return n; +} + +/* If group->n_ref == 1, then group->refs was set by + * populate_array_references to point directly into + * group->array->refs and should not be freed. + * If group->n_ref > 1, then group->refs was set by join_groups + * to point to a newly allocated array. + */ +struct gpu_array_ref_group *gpu_array_ref_group_free( + struct gpu_array_ref_group *group) +{ + if (!group) + return NULL; + gpu_array_tile_free(group->shared_tile); + gpu_array_tile_free(group->private_tile); + isl_map_free(group->access); + if (group->n_ref > 1) + free(group->refs); + free(group); + return NULL; +} + +/* Check if the access relations of group1 and group2 overlap within + * copy_sched. + */ +static int accesses_overlap(struct gpu_array_ref_group *group1, + struct gpu_array_ref_group *group2) +{ + int disjoint; + + disjoint = isl_map_is_disjoint(group1->access, group2->access); + if (disjoint < 0) + return -1; + + return !disjoint; +} + +/* Combine the given two groups into a single group, containing + * the references of both groups. + */ +static struct gpu_array_ref_group *join_groups( + struct gpu_array_ref_group *group1, + struct gpu_array_ref_group *group2) +{ + int i; + isl_ctx *ctx; + struct gpu_array_ref_group *group; + + if (!group1 || !group2) + return NULL; + + ctx = isl_map_get_ctx(group1->access); + group = isl_calloc_type(ctx, struct gpu_array_ref_group); + if (!group) + return NULL; + group->local_array = group1->local_array; + group->array = group1->array; + group->access = isl_map_union(isl_map_copy(group1->access), + isl_map_copy(group2->access)); + group->write = group1->write || group2->write; + group->exact_write = group1->exact_write && group2->exact_write; + group->slice = group1->slice || group2->slice; + group->n_ref = group1->n_ref + group2->n_ref; + group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *, + group->n_ref); + if (!group->refs) + return gpu_array_ref_group_free(group); + for (i = 0; i < group1->n_ref; ++i) + group->refs[i] = group1->refs[i]; + for (i = 0; i < group2->n_ref; ++i) + group->refs[group1->n_ref + i] = group2->refs[i]; + + return group; +} + +/* Combine the given two groups into a single group and free + * the original two groups. 
+ */ +static struct gpu_array_ref_group *join_groups_and_free( + struct gpu_array_ref_group *group1, + struct gpu_array_ref_group *group2) +{ + struct gpu_array_ref_group *group; + + group = join_groups(group1, group2); + gpu_array_ref_group_free(group1); + gpu_array_ref_group_free(group2); + return group; +} + +/* Report that the array reference group with the given access relation + * is not mapped to shared memory in the given kernel because + * it does not exhibit any reuse and is considered to be coalesced. + */ +static void report_no_reuse_and_coalesced(struct ppcg_kernel *kernel, + __isl_keep isl_union_map *access) +{ + isl_ctx *ctx; + isl_printer *p; + + ctx = isl_union_map_get_ctx(access); + p = isl_printer_to_file(ctx, stdout); + p = isl_printer_print_str(p, "Array reference group "); + p = isl_printer_print_union_map(p, access); + p = isl_printer_print_str(p, + " not considered for mapping to shared memory in kernel"); + p = isl_printer_print_int(p, kernel->id); + p = isl_printer_print_str(p, + " because it exhibits no reuse and is considered to be coalesced"); + p = isl_printer_end_line(p); + isl_printer_free(p); +} + +/* Given an access relation in terms of the data->thread_depth initial + * dimensions of the computed schedule and the thread identifiers + * (as parameters), check if the use of the corresponding private tile + * requires unrolling. + * + * If we are creating a private tile because we are forced to, + * then no unrolling is required. + * Otherwise we check if "access" is bijective and unrolling + * is required if it is not. Note that the access relation + * has already been determined to be bijective before the introduction + * of the thread identifiers and the removal of the schedule dimensions + * that are mapped to these threads. If the access relation is no longer + * bijective, then this means that more than one value of one of those + * schedule dimensions is mapped to the same thread and therefore + * unrolling is required. + */ +static int check_requires_unroll(struct gpu_group_data *data, + __isl_keep isl_map *access, int force_private) +{ + int bijective; + + if (force_private) + return 0; + bijective = access_is_bijective(data, access); + if (bijective < 0) + return -1; + return !bijective; +} + +/* Map the domain of "access" to the outer data->shared_depth + * schedule dimensions. When data->shared_depth is equal to + * data->thread_depth, this result is already available in group->access. + */ +static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group, + __isl_keep isl_union_map *access, struct gpu_group_data *data) +{ + isl_union_map *shared; + + if (data->shared_depth == data->thread_depth) + return isl_map_copy(group->access); + + shared = isl_union_map_copy(access); + shared = isl_union_map_apply_domain(shared, + isl_union_map_copy(data->shared_sched)); + return isl_map_from_union_map(shared); +} + +/* Compute the private and/or shared memory tiles for the array + * reference group "group" of array "array". + * Return 0 on success and -1 on error. + * + * If the array is a read-only scalar or if the user requested + * not to use shared or private memory, then we do not need to do anything. + * + * If any reference in the reference group accesses more than one element, + * then we would have to make sure that the layout in shared memory + * is the same as that in global memory. Since we do not handle this yet + * (and it may not even be possible), we refuse to map to private or + * shared memory in such cases. 
+ * + * If the array group involves any may writes (that are not must writes), + * then we would have to make sure that we load the data into shared/private + * memory first in case the data is not written by the kernel + * (but still written back out to global memory). + * Since we don't have any such mechanism at the moment, we don't + * compute shared/private tiles for groups involving may writes. + * + * We only try to compute a shared memory tile if there is any reuse + * or if the access is not coalesced. + * Reuse and coalescing are checked within the given kernel. + * + * For computing a private memory tile, we also require that there is + * some reuse. Moreover, we require that the access is private + * to the thread. That is, we check that any given array element + * is only accessed by a single thread. + * We compute an access relation that maps the outer + * data->thread_depth + data->n_thread schedule dimensions. + * The latter data->n_thread will be mapped to thread identifiers. + * We actually check that those iterators that will be wrapped + * partition the array space. This check is stricter than necessary + * since several iterations may be mapped onto the same thread + * and then they could be allowed to access the same memory elements, + * but our check does not allow this situation. + * + * For private memory tiles, the number of schedule dimensions that + * affect the offset is computed and stored in tile->depth, with + * a lower bound of data->kernel_depth. If this depth is smaller + * than the minimal depth that still ensures that every element + * is accessed by a single thread, then the depth is raised + * to this minimal depth. + * The fields of the tile are then adjusted to only refer to the tile->depth + * outer schedule dimensions. + * + * We also check that the index expression only depends on parallel + * loops. That way, we can move those loops innermost and unroll them. + * Again, we use a test that is stricter than necessary. + * We actually check whether the index expression only depends + * on the iterators that are wrapped over the threads. + * These are necessarily parallel, but there may be more parallel loops. + * + * Combining the injectivity of the first test with the single-valuedness + * of the second test, we simply test for bijectivity. + * + * If the use of the private tile requires unrolling, but some + * of the other arrays are forcibly mapped to private memory, + * then we do not allow the use of this private tile since + * we cannot move the schedule dimensions that need to be unrolled down + * without performing some kind of expansion on those arrays + * that are forcibly mapped to private memory. + * + * If the array is marked force_private, then we bypass all checks + * and assume we can (and should) use registers only. + * + * If it turns out we can (or have to) use registers, we compute + * the private memory tile size using can_tile, after introducing a dependence + * on the thread indices. 
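+ *
+ * For example, an access that touches consecutive array elements
+ * for consecutive values of the innermost schedule dimension that
+ * will be mapped to threads is considered coalesced.  If it
+ * additionally exhibits no reuse, then neither a shared nor
+ * (unless force_private is set) a private memory tile is computed
+ * for it.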
+ */ +static int compute_group_bounds_core(struct ppcg_kernel *kernel, + struct gpu_array_ref_group *group, struct gpu_group_data *data) +{ + isl_ctx *ctx = isl_space_get_ctx(group->array->space); + isl_union_map *access, *local; + int n_index = group->array->n_index; + int no_reuse, coalesced; + isl_map *acc; + int force_private = group->local_array->force_private; + int use_shared = !force_private && kernel->options->use_shared_memory && + data->n_thread > 0; + int use_private = force_private || kernel->options->use_private_memory; + int r = 0; + int requires_unroll; + int unique_depth; + + if (!use_shared && !use_private) + return 0; + if (gpu_array_is_read_only_scalar(group->array)) + return 0; + if (!force_private && !group->exact_write) + return 0; + if (group->slice) + return 0; + + access = gpu_array_ref_group_access_relation(group, 1, 1); + local = localize_access(data, isl_union_map_copy(access)); + no_reuse = isl_union_map_is_injective(local); + if (no_reuse < 0) + r = -1; + if (use_shared && no_reuse) + coalesced = access_is_coalesced(data, local); + isl_union_map_free(local); + + if (r >= 0 && kernel->options->debug->verbose && + use_shared && no_reuse && coalesced) + report_no_reuse_and_coalesced(kernel, access); + + if (use_shared && (!no_reuse || !coalesced)) { + group->shared_tile = gpu_array_tile_create(ctx, + group->array->n_index); + acc = shared_access(group, access, data); + if (!group->shared_tile) + r = -1; + else if (!can_tile(acc, group->shared_tile)) + group->shared_tile = + gpu_array_tile_free(group->shared_tile); + isl_map_free(acc); + } + + if (r < 0 || (!force_private && (!use_private || no_reuse))) { + isl_union_map_free(access); + return r; + } + + access = isl_union_map_apply_domain(access, + isl_union_map_copy(data->thread_sched)); + + acc = isl_map_from_union_map(access); + + if (!force_private && !access_is_bijective(data, acc)) { + isl_map_free(acc); + return 0; + } + + unique_depth = compute_accessed_by_single_thread_depth(data, acc); + + acc = isl_map_intersect_domain(acc, isl_set_copy(data->privatization)); + acc = isl_map_project_out(acc, isl_dim_in, data->thread_depth, + data->n_thread); + requires_unroll = check_requires_unroll(data, acc, force_private); + if (unique_depth < 0 || requires_unroll < 0 || + (requires_unroll && kernel->any_force_private)) { + isl_map_free(acc); + return requires_unroll < 0 ? -1 : 0; + } + + group->private_tile = gpu_array_tile_create(ctx, n_index); + if (!group->private_tile) { + isl_map_free(acc); + return -1; + } + group->private_tile->requires_unroll = requires_unroll; + if (!can_tile(acc, group->private_tile)) + group->private_tile = gpu_array_tile_free(group->private_tile); + + isl_map_free(acc); + + if (group->private_tile) { + struct gpu_array_tile *tile = group->private_tile; + int tile_depth = compute_tile_depth(data, tile); + if (tile_depth < unique_depth) + tile_depth = unique_depth; + if (tile_adjust_depth(tile, tile_depth) < 0) + return -1; + } + + if (force_private && !group->private_tile) + isl_die(ctx, isl_error_internal, + "unable to map array reference group to registers", + return -1); + + return 0; +} + +/* Compute the private and/or shared memory tiles for the array + * reference group "group" of array "array" and set the tile depth. + * Return 0 on success and -1 on error. 
+ */ +static int compute_group_bounds(struct ppcg_kernel *kernel, + struct gpu_array_ref_group *group, struct gpu_group_data *data) +{ + if (!group) + return -1; + if (compute_group_bounds_core(kernel, group, data) < 0) + return -1; + if (set_depth(data, group) < 0) + return -1; + + return 0; +} + +/* If two groups have overlapping access relations (as determined by + * the "overlap" function) and if one of them involves a write, + * then merge the two groups into one. + * If "compute_bounds" is set, then call compute_group_bounds + * on the merged groups. + * + * Return the updated number of groups. + * Return -1 on error. + */ +static int group_writes(struct ppcg_kernel *kernel, + int n, struct gpu_array_ref_group **groups, + int (*overlap)(struct gpu_array_ref_group *group1, + struct gpu_array_ref_group *group2), int compute_bounds, + struct gpu_group_data *data) +{ + int i, j; + + for (i = 0; i < n; ++i) { + for (j = n - 1; j > i; --j) { + if (!groups[i]->write && !groups[j]->write) + continue; + + if (!overlap(groups[i], groups[j])) + continue; + + groups[i] = join_groups_and_free(groups[i], groups[j]); + if (j != n - 1) + groups[j] = groups[n - 1]; + groups[n - 1] = NULL; + n--; + + if (!groups[i]) + return -1; + if (compute_bounds && + compute_group_bounds(kernel, groups[i], data) < 0) + return -1; + } + } + + return n; +} + +/* If two groups have overlapping access relations (within the innermost + * loop) and if one of them involves a write, then merge the two groups + * into one. + * + * Return the updated number of groups. + */ +static int group_overlapping_writes(struct ppcg_kernel *kernel, + int n, struct gpu_array_ref_group **groups, + struct gpu_group_data *data) +{ + return group_writes(kernel, n, groups, &accesses_overlap, 0, data); +} + +/* Check if the access relations of group1 and group2 overlap within + * the outermost min(group1->min_depth, group2->min_depth) loops. + */ +static int depth_accesses_overlap(struct gpu_array_ref_group *group1, + struct gpu_array_ref_group *group2) +{ + int depth; + int dim; + int empty; + isl_map *map_i, *map_j, *map; + + depth = group1->min_depth; + if (group2->min_depth < depth) + depth = group2->min_depth; + map_i = isl_map_copy(group1->access); + dim = isl_map_dim(map_i, isl_dim_in); + map_i = isl_map_eliminate(map_i, isl_dim_in, depth, dim - depth); + map_j = isl_map_copy(group2->access); + map_j = isl_map_eliminate(map_j, isl_dim_in, depth, dim - depth); + map = isl_map_intersect(map_i, map_j); + empty = isl_map_is_empty(map); + isl_map_free(map); + + return !empty; +} + +/* If two groups have overlapping access relations (within the outer + * depth loops) and if one of them involves a write, + * then merge the two groups into one. + * + * Return the updated number of groups. + */ +static int group_depth_overlapping_writes(struct ppcg_kernel *kernel, + int n, struct gpu_array_ref_group **groups, struct gpu_group_data *data) +{ + return group_writes(kernel, n, groups, &depth_accesses_overlap, 1, + data); +} + +/* Is the size of the tile specified by "tile" smaller than the sum of + * the sizes of the tiles specified by "tile1" and "tile2"? 
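+ *
+ * For example, if "tile1" and "tile2" each cover 32x4 elements of
+ * the same array, while "tile" covers 32x7 elements, then the size
+ * of "tile" (224 elements) is smaller than the combined size
+ * (256 elements), so merging the corresponding groups reduces
+ * the amount of shared memory needed.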
+ */ +static int smaller_tile(struct gpu_array_tile *tile, + struct gpu_array_tile *tile1, struct gpu_array_tile *tile2) +{ + int smaller; + isl_val *size, *size1, *size2; + + size = gpu_array_tile_size(tile); + size1 = gpu_array_tile_size(tile1); + size2 = gpu_array_tile_size(tile2); + + size = isl_val_sub(size, size1); + size = isl_val_sub(size, size2); + smaller = isl_val_is_neg(size); + + isl_val_free(size); + + return smaller; +} + +/* Given an initial grouping of array references and shared memory tiles + * for each group that allows for a shared memory tile, merge two groups + * if both have a shared memory tile, the merged group also has + * a shared memory tile and the size of the tile for the merge group + * is smaller than the sum of the tile sizes of the individual groups. + * + * If merging two groups decreases the depth of the tile of + * one or both of the two groups, then we need to check for overlapping + * writes again. + * + * Return the number of groups after merging. + * Return -1 on error. + */ +static int group_common_shared_memory_tile(struct ppcg_kernel *kernel, + struct gpu_array_info *array, int n, + struct gpu_array_ref_group **groups, struct gpu_group_data *data) +{ + int i, j; + int recompute_overlap = 0; + + for (i = 0; i < n; ++i) { + if (!groups[i]->shared_tile) + continue; + for (j = n - 1; j > i; --j) { + struct gpu_array_ref_group *group; + + if (!groups[j]->shared_tile) + continue; + + if (!depth_accesses_overlap(groups[i], groups[j])) + continue; + + group = join_groups(groups[i], groups[j]); + if (compute_group_bounds(kernel, group, data) < 0) { + gpu_array_ref_group_free(group); + return -1; + } + if (!group->shared_tile || + !smaller_tile(group->shared_tile, + groups[i]->shared_tile, + groups[j]->shared_tile)) { + gpu_array_ref_group_free(group); + continue; + } + + if (group->min_depth < groups[i]->min_depth || + group->min_depth < groups[j]->min_depth) + recompute_overlap = 1; + gpu_array_ref_group_free(groups[i]); + gpu_array_ref_group_free(groups[j]); + groups[i] = group; + if (j != n - 1) + groups[j] = groups[n - 1]; + n--; + } + } + + if (recompute_overlap) + n = group_depth_overlapping_writes(kernel, n, groups, data); + return n; +} + +/* Set array->n_group and array->groups to n and groups. + * + * Additionally, set the "nr" field of each group. + */ +static void set_array_groups(struct gpu_local_array_info *array, + int n, struct gpu_array_ref_group **groups) +{ + int i; + + array->n_group = n; + array->groups = groups; + + for (i = 0; i < n; ++i) + groups[i]->nr = i; +} + +/* Combine all groups in "groups" into a single group and return + * the new number of groups (1 or 0 if there were no groups to start with). + */ +static int join_all_groups(int n, struct gpu_array_ref_group **groups) +{ + int i; + + for (i = n - 1; i > 0; --i) { + groups[0] = join_groups_and_free(groups[0], groups[i]); + groups[i] = NULL; + n--; + } + + return n; +} + +/* Group array references that should be considered together when + * deciding whether to access them from private, shared or global memory. + * Return -1 on error. + * + * In particular, if two array references overlap and if one of them + * is a write, then the two references are grouped together. + * We first perform an initial grouping based only on the access relation. + * After computing shared and private memory tiles, we check for + * overlapping writes again, but this time taking into account + * the depth of the effective tile. 
+ * + * Furthermore, if two groups admit a shared memory tile and if the + * combination of the two also admits a shared memory tile, we merge + * the two groups. + * + * If the array contains structures, then we compute a single + * reference group without trying to find any tiles + * since we do not map such arrays to private or shared + * memory. The only exception is when those arrays of structures + * are required to be mapped to private memory. + */ +static int group_array_references(struct ppcg_kernel *kernel, + struct gpu_local_array_info *local, struct gpu_group_data *data) +{ + int i; + int n; + isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched); + struct gpu_array_ref_group **groups; + + groups = isl_calloc_array(ctx, struct gpu_array_ref_group *, + local->array->n_ref); + if (!groups) + return -1; + + n = populate_array_references(local, groups, data); + + if (local->array->has_compound_element && !local->force_private) { + n = join_all_groups(n, groups); + set_array_groups(local, n, groups); + return 0; + } + + n = group_overlapping_writes(kernel, n, groups, data); + + for (i = 0; i < n; ++i) + if (compute_group_bounds(kernel, groups[i], data) < 0) + n = -1; + + n = group_depth_overlapping_writes(kernel, n, groups, data); + + n = group_common_shared_memory_tile(kernel, local->array, + n, groups, data); + + set_array_groups(local, n, groups); + + if (n >= 0) + return 0; + + for (i = 0; i < local->array->n_ref; ++i) + gpu_array_ref_group_free(groups[i]); + return -1; +} + +/* For each array in the input program that can be mapped to private memory, + * check if there are any order dependences active inside the current kernel, + * within the same iteration of the host schedule, i.e., the prefix + * schedule at "node". + * If so, mark the array as force_private so that its reference groups will be + * mapped to a registers. + * + * Note that the arrays that cannot be mapped to private memory have + * had their order dependences added to prog->array_order and + * subsequently to the coincidence constraints. 
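+ *
+ * Roughly speaking, an order dependence that is only active within
+ * a single iteration of the host schedule corresponds to a live range
+ * that resides entirely inside one kernel launch.  Since such order
+ * dependences have not been turned into coincidence constraints,
+ * the parallel execution is only valid if each thread operates on
+ * its own copy of the array, hence force_private.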
+ */ +static void check_can_be_private_live_ranges(struct ppcg_kernel *kernel, + __isl_keep isl_schedule_node *node) +{ + int i; + isl_union_set *domain; + isl_multi_union_pw_aff *prefix; + isl_union_pw_multi_aff *contraction; + + if (!kernel->options->live_range_reordering) + return; + + kernel->any_force_private = 0; + + prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); + contraction = isl_union_pw_multi_aff_copy(kernel->contraction); + prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix, + contraction); + domain = isl_union_set_copy(kernel->expanded_domain); + domain = isl_union_set_universe(domain); + + for (i = 0; i < kernel->n_array; ++i) { + struct gpu_local_array_info *local = &kernel->array[i]; + isl_union_map *order; + + local->force_private = 0; + if (!gpu_array_can_be_private(local->array)) + continue; + order = isl_union_map_copy(local->array->dep_order); + order = isl_union_map_intersect_domain(order, + isl_union_set_copy(domain)); + order = isl_union_map_intersect_range(order, + isl_union_set_copy(domain)); + order = isl_union_map_eq_at_multi_union_pw_aff(order, + isl_multi_union_pw_aff_copy(prefix)); + if (!isl_union_map_is_empty(order)) { + local->force_private = 1; + kernel->any_force_private = 1; + } + isl_union_map_free(order); + } + + isl_multi_union_pw_aff_free(prefix); + isl_union_set_free(domain); +} + +/* Expand the domain of the schedule "s" by plugging in + * the contraction "contraction" and return the result. + */ +static __isl_give isl_union_map *expand(__isl_take isl_union_map *s, + __isl_keep isl_union_pw_multi_aff *contraction) +{ + contraction = isl_union_pw_multi_aff_copy(contraction); + s = isl_union_map_preimage_domain_union_pw_multi_aff(s, contraction); + return s; +} + +/* Create a set of dimension data->thread_depth + data->n_thread + * that equates the residue of the final data->n_thread dimensions + * modulo the kernel->block_dim sizes to the thread identifiers. + * Store the computed set in data->privatization. + * + * The construction starts with the space of kernel->thread_filter, + * which is known to reference all thread identifiers. + */ +static void compute_privatization(struct gpu_group_data *data, + struct ppcg_kernel *kernel) +{ + int i; + isl_ctx *ctx; + isl_space *space; + isl_local_space *ls; + isl_set *set; + + ctx = isl_union_map_get_ctx(data->shared_sched); + space = isl_union_set_get_space(kernel->thread_filter); + space = isl_space_set_from_params(space); + space = isl_space_add_dims(space, isl_dim_set, + data->thread_depth + data->n_thread); + set = isl_set_universe(space); + space = isl_set_get_space(set); + ls = isl_local_space_from_space(space); + + for (i = 0; i < data->n_thread; ++i) { + isl_aff *aff, *aff2; + isl_constraint *c; + isl_val *v; + isl_id *id; + int pos; + + aff = isl_aff_var_on_domain(isl_local_space_copy(ls), + isl_dim_set, data->thread_depth + i); + v = isl_val_int_from_si(ctx, kernel->block_dim[i]); + aff = isl_aff_mod_val(aff, v); + id = isl_id_list_get_id(kernel->thread_ids, i); + pos = isl_set_find_dim_by_id(set, isl_dim_param, id); + isl_id_free(id); + aff2 = isl_aff_var_on_domain(isl_local_space_copy(ls), + isl_dim_param, pos); + aff = isl_aff_sub(aff, aff2); + c = isl_equality_from_aff(aff); + set = isl_set_add_constraint(set, c); + } + + isl_local_space_free(ls); + data->privatization = set; +} + +/* Return the prefix schedule at "node" as a relation + * between domain elements and schedule dimensions after detecting + * equalities in this relation. 
+ */ +static __isl_give isl_union_map *prefix_with_equalities( + __isl_keep isl_schedule_node *node) +{ + isl_union_map *schedule; + + schedule = isl_schedule_node_get_prefix_schedule_relation(node); + schedule = isl_union_map_detect_equalities(schedule); + + return schedule; +} + +/* Group references of all arrays in "kernel". + * "node" points to the kernel mark. + * The mapping to shared memory in computed at the "shared" mark. + * + * We first extract all required schedule information into + * a gpu_group_data structure and then consider each array + * in turn. + */ +int gpu_group_references(struct ppcg_kernel *kernel, + __isl_keep isl_schedule_node *node) +{ + int i; + int r = 0; + isl_union_pw_multi_aff *contraction; + struct gpu_group_data data; + + check_can_be_private_live_ranges(kernel, node); + + data.scop = kernel->prog->scop; + + data.kernel_depth = isl_schedule_node_get_schedule_depth(node); + data.host_sched = isl_schedule_node_get_prefix_schedule_relation(node); + + node = isl_schedule_node_copy(node); + node = gpu_tree_move_down_to_shared(node, kernel->core); + data.shared_depth = isl_schedule_node_get_schedule_depth(node); + data.shared_sched = prefix_with_equalities(node); + + node = gpu_tree_move_down_to_thread(node, kernel->core); + node = isl_schedule_node_child(node, 0); + data.thread_depth = isl_schedule_node_get_schedule_depth(node); + data.n_thread = isl_schedule_node_band_n_member(node); + if (data.thread_depth == data.shared_depth) + data.copy_sched = isl_union_map_copy(data.shared_sched); + else + data.copy_sched = prefix_with_equalities(node); + data.thread_sched = isl_union_map_copy(data.copy_sched); + data.thread_sched = isl_union_map_flat_range_product(data.thread_sched, + isl_schedule_node_band_get_partial_schedule_union_map(node)); + data.thread_sched = isl_union_map_detect_equalities(data.thread_sched); + + contraction = isl_union_pw_multi_aff_copy(kernel->contraction); + data.host_sched = expand(data.host_sched, contraction); + data.shared_sched = expand(data.shared_sched, contraction); + if (data.thread_depth == data.shared_depth) { + isl_union_map_free(data.copy_sched); + data.copy_sched = isl_union_map_copy(data.shared_sched); + } else { + data.copy_sched = expand(data.copy_sched, contraction); + } + data.thread_sched = expand(data.thread_sched, contraction); + isl_union_pw_multi_aff_free(contraction); + + node = isl_schedule_node_child(node, 0); + data.full_sched = isl_union_map_copy(data.thread_sched); + data.full_sched = isl_union_map_flat_range_product(data.full_sched, + isl_schedule_node_get_subtree_schedule_union_map(node)); + isl_schedule_node_free(node); + + compute_privatization(&data, kernel); + + for (i = 0; i < kernel->n_array; ++i) { + r = group_array_references(kernel, &kernel->array[i], &data); + if (r < 0) + break; + } + + isl_union_map_free(data.host_sched); + isl_union_map_free(data.shared_sched); + isl_union_map_free(data.copy_sched); + isl_union_map_free(data.thread_sched); + isl_union_map_free(data.full_sched); + isl_set_free(data.privatization); + + return r; +} + +/* Given a description of an array tile "tile" and the "space" + * + * { D -> A } + * + * where D represents the first tile->depth schedule dimensions + * and A represents the array, construct an isl_multi_aff + * + * { [D[i] -> A[a]] -> A'[a'] } + * + * with A' a scaled down copy of A according to the shifts and strides + * in "tile". 
In particular, + * + * a' = (a + shift(i))/stride + * + * "insert_array" represents + * + * { [D -> A] -> D } + * + * and is used to insert A into the domain of functions that only + * reference D. + */ +static __isl_give isl_multi_aff *strided_tile( + struct gpu_array_tile *tile, __isl_keep isl_space *space, + __isl_keep isl_multi_aff *insert_array) +{ + int i; + isl_ctx *ctx; + isl_multi_aff *shift; + isl_multi_val *stride; + isl_space *space2; + isl_local_space *ls; + isl_multi_aff *tiling; + + ctx = isl_space_get_ctx(space); + space2 = isl_space_domain(isl_space_copy(space)); + ls = isl_local_space_from_space(space2); + space2 = isl_space_range(isl_space_copy(space)); + stride = isl_multi_val_zero(space2); + shift = isl_multi_aff_zero(isl_space_copy(space)); + + for (i = 0; i < tile->n; ++i) { + struct gpu_array_bound *bound = &tile->bound[i]; + isl_val *stride_i; + isl_aff *shift_i; + + if (tile->bound[i].shift) { + stride_i = isl_val_copy(bound->stride); + shift_i = isl_aff_copy(bound->shift); + } else { + stride_i = isl_val_one(ctx); + shift_i = isl_aff_zero_on_domain( + isl_local_space_copy(ls)); + } + + stride = isl_multi_val_set_val(stride, i, stride_i); + shift = isl_multi_aff_set_aff(shift, i, shift_i); + } + isl_local_space_free(ls); + + shift = isl_multi_aff_pullback_multi_aff(shift, + isl_multi_aff_copy(insert_array)); + + tiling = isl_multi_aff_range_map(isl_space_copy(space)); + tiling = isl_multi_aff_add(tiling, shift); + tiling = isl_multi_aff_scale_down_multi_val(tiling, stride); + + return tiling; +} + +/* Compute a tiling for the array reference group "group". + * + * The tiling is of the form + * + * { [D[i] -> A[a]] -> T[t] } + * + * where D represents the first tile->depth schedule dimensions, + * A represents the global array and T represents the shared or + * private memory tile. The name of T is the name of the local + * array. 
+ * + * If there is any stride in the accesses, then the mapping is + * + * t = (a + shift(i))/stride - lb(i) + * + * otherwise, it is simply + * + * t = a - lb(i) + */ +void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group) +{ + int i; + struct gpu_array_tile *tile; + isl_space *space; + isl_multi_aff *tiling, *lb, *insert_array; + isl_printer *p; + char *local_name; + + tile = gpu_array_ref_group_tile(group); + if (!tile) + return; + + space = isl_map_get_space(group->access); + space = isl_space_from_range(isl_space_range(space)); + space = isl_space_add_dims(space, isl_dim_in, tile->depth); + insert_array = isl_multi_aff_domain_map(isl_space_copy(space)); + + for (i = 0; i < tile->n; ++i) + if (tile->bound[i].shift) + break; + + if (i < tile->n) + tiling = strided_tile(tile, space, insert_array); + else + tiling = isl_multi_aff_range_map(isl_space_copy(space)); + + lb = isl_multi_aff_zero(space); + for (i = 0; i < tile->n; ++i) { + isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb); + lb = isl_multi_aff_set_aff(lb, i, lb_i); + } + lb = isl_multi_aff_pullback_multi_aff(lb, insert_array); + + tiling = isl_multi_aff_sub(tiling, lb); + + p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling)); + p = gpu_array_ref_group_print_name(group, p); + local_name = isl_printer_get_str(p); + isl_printer_free(p); + tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name); + free(local_name); + + tile->tiling = tiling; +} diff --git a/polly/lib/External/ppcg/gpu_hybrid.h b/polly/lib/External/ppcg/gpu_hybrid.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_hybrid.h @@ -0,0 +1,13 @@ +#ifndef GPU_HYBRID_H +#define GPU_HYBRID_H + +#include + +#include "gpu.h" +#include "hybrid.h" + +__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen, + __isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds, + int *tile_sizes); + +#endif diff --git a/polly/lib/External/ppcg/gpu_hybrid.c b/polly/lib/External/ppcg/gpu_hybrid.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_hybrid.c @@ -0,0 +1,146 @@ +/* + * Copyright 2013 Ecole Normale Superieure + * Copyright 2015 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include + +#include +#include +#include +#include + +#include "hybrid.h" +#include "gpu_hybrid.h" +#include "gpu_tree.h" +#include "schedule.h" +#include "util.h" + +/* Have all domain elements been filtered out before reaching + * the "node" position in the schedule tree? + */ +static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node) +{ + isl_union_set *domain; + isl_bool empty; + + domain = isl_schedule_node_get_domain(node); + empty = isl_union_set_is_empty(domain); + isl_union_set_free(domain); + + return empty; +} + +/* Given a pointer to a phase in the result of hybrid tiling, + * map the phase to the device, provided the phase is non-empty. + * Empty phases can occur if the input schedule domain can be + * covered by a small number of hexagons that all belong to the same phase. + * + * The input has the following form: + * + * M - CT - P - C - ... + * + * with M the phase marker, CT the space tiling, P the original + * parent band and C the original child band. + * The (outer dimensions of the) C band need to be mapped to threads. + * The (outer dimension of the) CT band needs to be mapped to blocks. 
+ * The mapping to shared memory needs to be computed between the CT and + * the P band. + * + * The C band is first shifted to start at zero. + * Then the appropriate markers are introduced and a kernel is + * created for the tree rooted at CT. + * If the "unroll_gpu_tile" option is set, then the AST generator + * is instructed to unroll the P and C bands. + */ +static __isl_give isl_schedule_node *update_phase( + __isl_take isl_schedule_node *node, void *user) +{ + struct gpu_gen *gen = user; + int depth0, depth; + isl_ctx *ctx; + isl_id *id; + isl_bool empty_domain; + ppcg_ht_phase *phase; + + empty_domain = has_empty_domain(node); + if (empty_domain < 0) + return isl_schedule_node_free(node); + if (empty_domain) + return node; + + if (!node) + return NULL; + ctx = isl_schedule_node_get_ctx(node); + + phase = ppcg_ht_phase_extract_from_mark(node); + + depth0 = isl_schedule_node_get_tree_depth(node); + + node = isl_schedule_node_child(node, 0); + + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); + node = ppcg_ht_phase_shift_space_point(phase, node); + if (gen->options->unroll_gpu_tile) + node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); + id = isl_id_alloc(ctx, "thread", NULL); + node = isl_schedule_node_insert_mark(node, id); + node = isl_schedule_node_parent(node); + if (gen->options->unroll_gpu_tile) + node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); + id = isl_id_alloc(ctx, "shared", NULL); + node = isl_schedule_node_insert_mark(node, id); + node = isl_schedule_node_parent(node); + + node = gpu_create_kernel(gen, node, 0, NULL); + + depth = isl_schedule_node_get_tree_depth(node); + node = isl_schedule_node_ancestor(node, depth - depth0); + + return node; +} + +/* Apply hybrid tiling on "node" and its parent based on the (valid) + * bounds on the relative dependence distances "bounds" and + * the tile sizes in "tile_sizes". + * The number of elements in "tile_sizes" is at least as large + * as the sum of the dimensions of the parent and the child node. + * + * Convert the tile_sizes to an isl_multi_val in the right space, + * insert the hybrid tiling and then create a kernel inside each phase. + * Finally, remove the phase marks. 
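+ *
+ * For example, for a parent band of dimension 1 and a child band of
+ * dimension 2, "tile_sizes" needs to hold at least 3 values and the
+ * isl_multi_val is constructed in the product of the two band spaces.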
+ */ +__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen, + __isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds, + int *tile_sizes) +{ + isl_multi_val *mv; + isl_space *space, *space2; + + if (!node || !bounds) + goto error; + + space2 = isl_schedule_node_band_get_space(node); + node = isl_schedule_node_parent(node); + space = isl_schedule_node_band_get_space(node); + space = isl_space_product(space, space2); + mv = ppcg_multi_val_from_int_list(space, tile_sizes); + + node = ppcg_ht_bounds_insert_tiling(bounds, mv, node, gen->options); + + node = hybrid_tile_foreach_phase(node, &update_phase, gen); + + node = hybrid_tile_drop_phase_marks(node); + + return node; +error: + isl_schedule_node_free(node); + ppcg_ht_bounds_free(bounds); + return NULL; +} diff --git a/polly/lib/External/ppcg/gpu_print.h b/polly/lib/External/ppcg/gpu_print.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_print.h @@ -0,0 +1,28 @@ +#ifndef GPU_PRINT_H +#define GPU_PRINT_H + +#include "gpu.h" + +__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p, + struct gpu_prog *prog); + +__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p, + struct gpu_types *types, struct gpu_prog *prog); + +__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node); + +__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn, + struct gpu_array_info *array); +__isl_give isl_printer *gpu_array_info_print_declaration_argument( + __isl_take isl_printer *p, struct gpu_array_info *array, + const char *memory_space); +__isl_give isl_printer *gpu_array_info_print_call_argument( + __isl_take isl_printer *p, struct gpu_array_info *array); + +__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p, + struct ppcg_kernel_stmt *stmt); +__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p, + struct ppcg_kernel_stmt *stmt); + +#endif diff --git a/polly/lib/External/ppcg/gpu_print.c b/polly/lib/External/ppcg/gpu_print.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_print.c @@ -0,0 +1,310 @@ +/* + * Copyright 2012 Ecole Normale Superieure + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France + */ + +#include + +#include + +#include "gpu_print.h" +#include "print.h" +#include "schedule.h" + +/* Print declarations to "p" for arrays that are local to "prog" + * but that are used on the host and therefore require a declaration. + */ +__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + int i; + + if (!prog) + return isl_printer_free(p); + + for (i = 0; i < prog->n_array; ++i) { + struct gpu_array_info *array = &prog->array[i]; + isl_ast_expr *size; + + if (!array->declare_local) + continue; + size = array->declared_size; + p = ppcg_print_declaration_with_size(p, array->type, size); + } + + return p; +} + +/* Print an expression for the size of "array" in bytes. 
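+ *
+ * For example, for a two-dimensional array "float A[n][m]",
+ * the printed expression is "(n) * (m) * sizeof(float)".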
+ */ +__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn, + struct gpu_array_info *array) +{ + int i; + + for (i = 0; i < array->n_index; ++i) { + isl_ast_expr *bound; + + prn = isl_printer_print_str(prn, "("); + bound = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i); + prn = isl_printer_print_ast_expr(prn, bound); + isl_ast_expr_free(bound); + prn = isl_printer_print_str(prn, ") * "); + } + prn = isl_printer_print_str(prn, "sizeof("); + prn = isl_printer_print_str(prn, array->type); + prn = isl_printer_print_str(prn, ")"); + + return prn; +} + +/* Print the declaration of a non-linearized array argument. + */ +static __isl_give isl_printer *print_non_linearized_declaration_argument( + __isl_take isl_printer *p, struct gpu_array_info *array) +{ + p = isl_printer_print_str(p, array->type); + p = isl_printer_print_str(p, " "); + + p = isl_printer_print_ast_expr(p, array->bound_expr); + + return p; +} + +/* Print the declaration of an array argument. + * "memory_space" allows to specify a memory space prefix. + */ +__isl_give isl_printer *gpu_array_info_print_declaration_argument( + __isl_take isl_printer *p, struct gpu_array_info *array, + const char *memory_space) +{ + if (gpu_array_is_read_only_scalar(array)) { + p = isl_printer_print_str(p, array->type); + p = isl_printer_print_str(p, " "); + p = isl_printer_print_str(p, array->name); + return p; + } + + if (memory_space) { + p = isl_printer_print_str(p, memory_space); + p = isl_printer_print_str(p, " "); + } + + if (array->n_index != 0 && !array->linearize) + return print_non_linearized_declaration_argument(p, array); + + p = isl_printer_print_str(p, array->type); + p = isl_printer_print_str(p, " "); + p = isl_printer_print_str(p, "*"); + p = isl_printer_print_str(p, array->name); + + return p; +} + +/* Print the call of an array argument. + */ +__isl_give isl_printer *gpu_array_info_print_call_argument( + __isl_take isl_printer *p, struct gpu_array_info *array) +{ + if (gpu_array_is_read_only_scalar(array)) + return isl_printer_print_str(p, array->name); + + p = isl_printer_print_str(p, "dev_"); + p = isl_printer_print_str(p, array->name); + + return p; +} + +/* Print an access to the element in the private/shared memory copy + * described by "stmt". The index of the copy is recorded in + * stmt->local_index as an access to the array. + */ +static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p, + struct ppcg_kernel_stmt *stmt) +{ + return isl_printer_print_ast_expr(p, stmt->u.c.local_index); +} + +/* Print an access to the element in the global memory copy + * described by "stmt". The index of the copy is recorded in + * stmt->index as an access to the array. + */ +static __isl_give isl_printer *stmt_print_global_index( + __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt) +{ + struct gpu_array_info *array = stmt->u.c.array; + isl_ast_expr *index; + + if (gpu_array_is_scalar(array)) { + if (!gpu_array_is_read_only_scalar(array)) + p = isl_printer_print_str(p, "*"); + p = isl_printer_print_str(p, array->name); + return p; + } + + index = isl_ast_expr_copy(stmt->u.c.index); + + p = isl_printer_print_ast_expr(p, index); + isl_ast_expr_free(index); + + return p; +} + +/* Print a copy statement. 
+ * + * A read copy statement is printed as + * + * local = global; + * + * while a write copy statement is printed as + * + * global = local; + */ +__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p, + struct ppcg_kernel_stmt *stmt) +{ + p = isl_printer_start_line(p); + if (stmt->u.c.read) { + p = stmt_print_local_index(p, stmt); + p = isl_printer_print_str(p, " = "); + p = stmt_print_global_index(p, stmt); + } else { + p = stmt_print_global_index(p, stmt); + p = isl_printer_print_str(p, " = "); + p = stmt_print_local_index(p, stmt); + } + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + return p; +} + +__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p, + struct ppcg_kernel_stmt *stmt) +{ + return pet_stmt_print_body(stmt->u.d.stmt->stmt, p, stmt->u.d.ref2expr); +} + +/* This function is called for each node in a GPU AST. + * In case of a user node, print the macro definitions required + * for printing the AST expressions in the annotation, if any. + * For other nodes, return true such that descendants are also + * visited. + * + * In particular, for a kernel launch, print the macro definitions + * needed for the grid size. + * For a copy statement, print the macro definitions needed + * for the two index expressions. + * For an original user statement, print the macro definitions + * needed for the substitutions. + */ +static isl_bool at_node(__isl_keep isl_ast_node *node, void *user) +{ + const char *name; + isl_id *id; + int is_kernel; + struct ppcg_kernel *kernel; + struct ppcg_kernel_stmt *stmt; + isl_printer **p = user; + + if (isl_ast_node_get_type(node) != isl_ast_node_user) + return isl_bool_true; + + id = isl_ast_node_get_annotation(node); + if (!id) + return isl_bool_false; + + name = isl_id_get_name(id); + if (!name) + return isl_bool_error; + is_kernel = !strcmp(name, "kernel"); + kernel = is_kernel ? isl_id_get_user(id) : NULL; + stmt = is_kernel ? NULL : isl_id_get_user(id); + isl_id_free(id); + + if ((is_kernel && !kernel) || (!is_kernel && !stmt)) + return isl_bool_error; + + if (is_kernel) { + *p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p); + } else if (stmt->type == ppcg_kernel_copy) { + *p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p); + *p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p); + } else if (stmt->type == ppcg_kernel_domain) { + *p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr); + } + if (!*p) + return isl_bool_error; + + return isl_bool_false; +} + +/* Print the required macros for the GPU AST "node" to "p", + * including those needed for the user statements inside the AST. + */ +__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node) +{ + if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0) + return isl_printer_free(p); + p = ppcg_print_macros(p, node); + return p; +} + +/* Was the definition of "type" printed before? + * That is, does its name appear in the list of printed types "types"? + */ +static int already_printed(struct gpu_types *types, + struct pet_type *type) +{ + int i; + + for (i = 0; i < types->n; ++i) + if (!strcmp(types->name[i], type->name)) + return 1; + + return 0; +} + +/* Print the definitions of all types prog->scop that have not been + * printed before (according to "types") on "p". + * Extend the list of printed types "types" with the newly printed types. 
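+ *
+ * For example, if two arrays in the scop share an element type
+ * "struct complex { double re, im; }", then its definition is
+ * printed only once, terminated by a ";".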
+ */ +__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p, + struct gpu_types *types, struct gpu_prog *prog) +{ + int i, n; + isl_ctx *ctx; + char **name; + + n = prog->scop->pet->n_type; + + if (n == 0) + return p; + + ctx = isl_printer_get_ctx(p); + name = isl_realloc_array(ctx, types->name, char *, types->n + n); + if (!name) + return isl_printer_free(p); + types->name = name; + + for (i = 0; i < n; ++i) { + struct pet_type *type = prog->scop->pet->types[i]; + + if (already_printed(types, type)) + continue; + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, type->definition); + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + types->name[types->n++] = strdup(type->name); + } + + return p; +} diff --git a/polly/lib/External/ppcg/gpu_tree.h b/polly/lib/External/ppcg/gpu_tree.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_tree.h @@ -0,0 +1,33 @@ +#ifndef GPU_TREE_H +#define GPU_TREE_H + +#include + +#include "gpu.h" + +__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread( + __isl_take isl_schedule_node *node); +int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node); +__isl_give isl_schedule_node *gpu_tree_move_down_to_shared( + __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core); +__isl_give isl_schedule_node *gpu_tree_move_up_to_thread( + __isl_take isl_schedule_node *node); +__isl_give isl_schedule_node *gpu_tree_move_down_to_thread( + __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core); +__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel( + __isl_take isl_schedule_node *node); +__isl_give isl_schedule_node *gpu_tree_move_down_to_depth( + __isl_take isl_schedule_node *node, int depth, + __isl_keep isl_union_set *core); + +int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel); +__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); +__isl_give isl_schedule_node *gpu_tree_ensure_following_sync( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); +__isl_give isl_schedule_node *gpu_tree_move_left_to_sync( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); +__isl_give isl_schedule_node *gpu_tree_move_right_to_sync( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); + +#endif diff --git a/polly/lib/External/ppcg/gpu_tree.c b/polly/lib/External/ppcg/gpu_tree.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/gpu_tree.c @@ -0,0 +1,640 @@ +/* + * Copyright 2013 Ecole Normale Superieure + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include + +#include +#include +#include + +#include "gpu_tree.h" + +/* The functions in this file are used to navigate part of a schedule tree + * that is mapped to blocks. Initially, this part consists of a linear + * branch segment with a mark node with name "kernel" on the outer end + * and a mark node with name "thread" on the inner end. + * During the mapping to blocks, branching may be introduced, but only + * one of the elements in each sequence contains the "thread" mark. + * The filter of this element (and only this filter) contains + * domain elements identified by the "core" argument of the functions + * that move down this tree. 
+ * + * Synchronization statements have a name that starts with "sync" and + * a user pointer pointing to the kernel that contains the synchronization. + * The functions inserting or detecting synchronizations take a ppcg_kernel + * argument to be able to create or identify such statements. + * They may also use two fields in this structure, the "core" field + * to move around in the tree and the "n_sync" field to make sure that + * each synchronization has a different name (within the kernel). + */ + +/* Is "node" a mark node with an identifier called "name"? + */ +static int is_marked(__isl_keep isl_schedule_node *node, const char *name) +{ + isl_id *mark; + int has_name; + + if (!node) + return -1; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_mark) + return 0; + + mark = isl_schedule_node_mark_get_id(node); + if (!mark) + return -1; + + has_name = !strcmp(isl_id_get_name(mark), name); + isl_id_free(mark); + + return has_name; +} + +/* Is "node" a mark node with an identifier called "kernel"? + */ +int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node) +{ + return is_marked(node, "kernel"); +} + +/* Is "node" a mark node with an identifier called "shared"? + */ +static int node_is_shared(__isl_keep isl_schedule_node *node) +{ + return is_marked(node, "shared"); +} + +/* Is "node" a mark node with an identifier called "thread"? + */ +static int node_is_thread(__isl_keep isl_schedule_node *node) +{ + return is_marked(node, "thread"); +} + +/* Insert a mark node with identifier "shared" in front of "node". + */ +static __isl_give isl_schedule_node *insert_shared( + __isl_take isl_schedule_node *node) +{ + isl_ctx *ctx; + isl_id *id; + + ctx = isl_schedule_node_get_ctx(node); + id = isl_id_alloc(ctx, "shared", NULL); + node = isl_schedule_node_insert_mark(node, id); + + return node; +} + +/* Insert a "shared" mark in front of the "thread" mark + * provided the linear branch between "node" and the "thread" mark + * does not contain such a "shared" mark already. + * + * As a side effect, this function checks that the subtree at "node" + * actually contains a "thread" mark and that there is no branching + * in between "node" and this "thread" mark. + */ +__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread( + __isl_take isl_schedule_node *node) +{ + int depth0, depth; + int any_shared = 0; + + if (!node) + return NULL; + + depth0 = isl_schedule_node_get_tree_depth(node); + + for (;;) { + int is_thread; + int n; + + if (!any_shared) { + any_shared = node_is_shared(node); + if (any_shared < 0) + return isl_schedule_node_free(node); + } + is_thread = node_is_thread(node); + if (is_thread < 0) + return isl_schedule_node_free(node); + if (is_thread) + break; + n = isl_schedule_node_n_children(node); + if (n == 0) + isl_die(isl_schedule_node_get_ctx(node), + isl_error_invalid, + "no thread marker found", + return isl_schedule_node_free(node)); + if (n > 1) + isl_die(isl_schedule_node_get_ctx(node), + isl_error_invalid, + "expecting single thread marker", + return isl_schedule_node_free(node)); + + node = isl_schedule_node_child(node, 0); + } + + if (!any_shared) + node = insert_shared(node); + depth = isl_schedule_node_get_tree_depth(node); + node = isl_schedule_node_ancestor(node, depth - depth0); + + return node; +} + +/* Assuming "node" is a filter node, does it correspond to the branch + * that contains the "thread" mark, i.e., does it contain any elements + * in "core"? 
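+ *
+ * The test simply checks whether the filter of "node" is not
+ * disjoint from "core".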
+ */ +static int node_is_core(__isl_keep isl_schedule_node *node, + __isl_keep isl_union_set *core) +{ + int disjoint; + isl_union_set *filter; + + filter = isl_schedule_node_filter_get_filter(node); + disjoint = isl_union_set_is_disjoint(filter, core); + isl_union_set_free(filter); + if (disjoint < 0) + return -1; + + return !disjoint; +} + +/* Move to the only child of "node" that has the "thread" mark as descendant, + * where the branch containing this mark is identified by the domain elements + * in "core". + * + * If "node" is not a sequence, then it only has one child and we move + * to that single child. + * Otherwise, we check each of the filters in the children, pick + * the one that corresponds to "core" and return a pointer to the child + * of the filter node. + */ +static __isl_give isl_schedule_node *core_child( + __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) +{ + int i, n; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence) + return isl_schedule_node_child(node, 0); + + n = isl_schedule_node_n_children(node); + for (i = 0; i < n; ++i) { + int is_core; + + node = isl_schedule_node_child(node, i); + is_core = node_is_core(node, core); + + if (is_core < 0) + return isl_schedule_node_free(node); + if (is_core) + return isl_schedule_node_child(node, 0); + + node = isl_schedule_node_parent(node); + } + + isl_die(isl_schedule_node_get_ctx(node), isl_error_internal, + "core child not found", return isl_schedule_node_free(node)); +} + +/* Move down the branch between "kernel" and "thread" until + * the "shared" mark is reached, where the branch containing the "shared" + * mark is identified by the domain elements in "core". + */ +__isl_give isl_schedule_node *gpu_tree_move_down_to_shared( + __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) +{ + int is_shared; + + while ((is_shared = node_is_shared(node)) == 0) + node = core_child(node, core); + if (is_shared < 0) + node = isl_schedule_node_free(node); + + return node; +} + +/* Move down the branch between "kernel" and "thread" until + * the "thread" mark is reached, where the branch containing the "thread" + * mark is identified by the domain elements in "core". + */ +__isl_give isl_schedule_node *gpu_tree_move_down_to_thread( + __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) +{ + int is_thread; + + while ((is_thread = node_is_thread(node)) == 0) + node = core_child(node, core); + if (is_thread < 0) + node = isl_schedule_node_free(node); + + return node; +} + +/* Move up the tree underneath the "thread" mark until + * the "thread" mark is reached. + */ +__isl_give isl_schedule_node *gpu_tree_move_up_to_thread( + __isl_take isl_schedule_node *node) +{ + int is_thread; + + while ((is_thread = node_is_thread(node)) == 0) + node = isl_schedule_node_parent(node); + if (is_thread < 0) + node = isl_schedule_node_free(node); + + return node; +} + +/* Move up the tree underneath the "kernel" mark until + * the "kernel" mark is reached. + */ +__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel( + __isl_take isl_schedule_node *node) +{ + int is_kernel; + + while ((is_kernel = gpu_tree_node_is_kernel(node)) == 0) + node = isl_schedule_node_parent(node); + if (is_kernel < 0) + node = isl_schedule_node_free(node); + + return node; +} + +/* Move down from the "kernel" mark (or at least a node with schedule + * depth smaller than or equal to "depth") to a band node at schedule + * depth "depth". 
The "thread" mark is assumed to have a schedule + * depth greater than or equal to "depth". The branch containing the + * "thread" mark is identified by the domain elements in "core". + * + * If the desired schedule depth is in the middle of band node, + * then the band node is split into two pieces, the second piece + * at the desired schedule depth. + */ +__isl_give isl_schedule_node *gpu_tree_move_down_to_depth( + __isl_take isl_schedule_node *node, int depth, + __isl_keep isl_union_set *core) +{ + int is_shared; + int is_thread = 0; + + while (node && isl_schedule_node_get_schedule_depth(node) < depth) { + if (isl_schedule_node_get_type(node) == + isl_schedule_node_band) { + int node_depth, node_dim; + node_depth = isl_schedule_node_get_schedule_depth(node); + node_dim = isl_schedule_node_band_n_member(node); + if (node_depth + node_dim > depth) + node = isl_schedule_node_band_split(node, + depth - node_depth); + } + node = core_child(node, core); + } + while ((is_shared = node_is_shared(node)) == 0 && + (is_thread = node_is_thread(node)) == 0 && + isl_schedule_node_get_type(node) != isl_schedule_node_band) + node = core_child(node, core); + if (is_shared < 0 || is_thread < 0) + node = isl_schedule_node_free(node); + + return node; +} + +/* Create a union set containing a single set with a tuple identifier + * called "syncX" and user pointer equal to "kernel". + */ +static __isl_give isl_union_set *create_sync_domain(struct ppcg_kernel *kernel) +{ + isl_space *space; + isl_id *id; + char name[40]; + + space = isl_space_set_alloc(kernel->ctx, 0, 0); + snprintf(name, sizeof(name), "sync%d", kernel->n_sync++); + id = isl_id_alloc(kernel->ctx, name, kernel); + space = isl_space_set_tuple_id(space, isl_dim_set, id); + return isl_union_set_from_set(isl_set_universe(space)); +} + +/* Is "id" the identifier of a synchronization statement inside "kernel"? + * That is, does its name start with "sync" and does it point to "kernel"? + */ +int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel) +{ + const char *name; + + name = isl_id_get_name(id); + if (!name) + return 0; + else if (strncmp(name, "sync", 4)) + return 0; + return isl_id_get_user(id) == kernel; +} + +/* Does "domain" consist of a single set with a tuple identifier + * corresponding to a synchronization for "kernel"? + */ +static int domain_is_sync(__isl_keep isl_union_set *domain, + struct ppcg_kernel *kernel) +{ + int is_sync; + isl_id *id; + isl_set *set; + + if (isl_union_set_n_set(domain) != 1) + return 0; + set = isl_set_from_union_set(isl_union_set_copy(domain)); + id = isl_set_get_tuple_id(set); + is_sync = gpu_tree_id_is_sync(id, kernel); + isl_id_free(id); + isl_set_free(set); + + return is_sync; +} + +/* Does "node" point to a filter selecting a synchronization statement + * for "kernel"? + */ +static int node_is_sync_filter(__isl_keep isl_schedule_node *node, + struct ppcg_kernel *kernel) +{ + int is_sync; + enum isl_schedule_node_type type; + isl_union_set *domain; + + if (!node) + return -1; + type = isl_schedule_node_get_type(node); + if (type != isl_schedule_node_filter) + return 0; + domain = isl_schedule_node_filter_get_filter(node); + is_sync = domain_is_sync(domain, kernel); + isl_union_set_free(domain); + + return is_sync; +} + +/* Is "node" part of a sequence with a previous synchronization statement + * for "kernel"? + * That is, is the parent of "node" a filter such that there is + * a previous filter that picks out exactly such a synchronization statement? 
+ */ +static int has_preceding_sync(__isl_keep isl_schedule_node *node, + struct ppcg_kernel *kernel) +{ + int found = 0; + + node = isl_schedule_node_copy(node); + node = isl_schedule_node_parent(node); + while (!found && isl_schedule_node_has_previous_sibling(node)) { + node = isl_schedule_node_previous_sibling(node); + if (!node) + break; + found = node_is_sync_filter(node, kernel); + } + if (!node) + found = -1; + isl_schedule_node_free(node); + + return found; +} + +/* Is "node" part of a sequence with a subsequent synchronization statement + * for "kernel"? + * That is, is the parent of "node" a filter such that there is + * a subsequent filter that picks out exactly such a synchronization statement? + */ +static int has_following_sync(__isl_keep isl_schedule_node *node, + struct ppcg_kernel *kernel) +{ + int found = 0; + + node = isl_schedule_node_copy(node); + node = isl_schedule_node_parent(node); + while (!found && isl_schedule_node_has_next_sibling(node)) { + node = isl_schedule_node_next_sibling(node); + if (!node) + break; + found = node_is_sync_filter(node, kernel); + } + if (!node) + found = -1; + isl_schedule_node_free(node); + + return found; +} + +/* Does the subtree rooted at "node" (which is a band node) contain + * any synchronization statement for "kernel" that precedes + * the core computation of "kernel" (identified by the elements + * in kernel->core)? + */ +static int has_sync_before_core(__isl_keep isl_schedule_node *node, + struct ppcg_kernel *kernel) +{ + int has_sync = 0; + int is_thread; + + node = isl_schedule_node_copy(node); + while ((is_thread = node_is_thread(node)) == 0) { + node = core_child(node, kernel->core); + has_sync = has_preceding_sync(node, kernel); + if (has_sync < 0 || has_sync) + break; + } + if (is_thread < 0 || !node) + has_sync = -1; + isl_schedule_node_free(node); + + return has_sync; +} + +/* Does the subtree rooted at "node" (which is a band node) contain + * any synchronization statement for "kernel" that follows + * the core computation of "kernel" (identified by the elements + * in kernel->core)? + */ +static int has_sync_after_core(__isl_keep isl_schedule_node *node, + struct ppcg_kernel *kernel) +{ + int has_sync = 0; + int is_thread; + + node = isl_schedule_node_copy(node); + while ((is_thread = node_is_thread(node)) == 0) { + node = core_child(node, kernel->core); + has_sync = has_following_sync(node, kernel); + if (has_sync < 0 || has_sync) + break; + } + if (is_thread < 0 || !node) + has_sync = -1; + isl_schedule_node_free(node); + + return has_sync; +} + +/* Insert (or extend) an extension on top of "node" that puts + * a synchronization node for "kernel" before "node". + * Return a pointer to the original node in the updated schedule tree. + */ +static __isl_give isl_schedule_node *insert_sync_before( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) +{ + isl_union_set *domain; + isl_schedule_node *graft; + + if (!node) + return NULL; + + domain = create_sync_domain(kernel); + graft = isl_schedule_node_from_domain(domain); + node = isl_schedule_node_graft_before(node, graft); + + return node; +} + +/* Insert (or extend) an extension on top of "node" that puts + * a synchronization node for "kernel" afater "node". + * Return a pointer to the original node in the updated schedule tree. 
+ */ +static __isl_give isl_schedule_node *insert_sync_after( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) +{ + isl_union_set *domain; + isl_schedule_node *graft; + + if (!node) + return NULL; + + domain = create_sync_domain(kernel); + graft = isl_schedule_node_from_domain(domain); + node = isl_schedule_node_graft_after(node, graft); + + return node; +} + +/* Insert an extension on top of "node" that puts a synchronization node + * for "kernel" before "node" unless there already is + * such a synchronization node. + */ +__isl_give isl_schedule_node *gpu_tree_ensure_preceding_sync( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) +{ + int has_sync; + + has_sync = has_preceding_sync(node, kernel); + if (has_sync < 0) + return isl_schedule_node_free(node); + if (has_sync) + return node; + return insert_sync_before(node, kernel); +} + +/* Insert an extension on top of "node" that puts a synchronization node + * for "kernel" after "node" unless there already is + * such a synchronization node. + */ +__isl_give isl_schedule_node *gpu_tree_ensure_following_sync( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) +{ + int has_sync; + + has_sync = has_following_sync(node, kernel); + if (has_sync < 0) + return isl_schedule_node_free(node); + if (has_sync) + return node; + return insert_sync_after(node, kernel); +} + +/* Insert an extension on top of "node" that puts a synchronization node + * for "kernel" after "node" unless there already is such a sync node or + * "node" itself already * contains a synchronization node following + * the core computation of "kernel". + */ +__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) +{ + int has_sync; + + has_sync = has_sync_after_core(node, kernel); + if (has_sync < 0) + return isl_schedule_node_free(node); + if (has_sync) + return node; + has_sync = has_following_sync(node, kernel); + if (has_sync < 0) + return isl_schedule_node_free(node); + if (has_sync) + return node; + return insert_sync_after(node, kernel); +} + +/* Move left in the sequence on top of "node" to a synchronization node + * for "kernel". + * If "node" itself contains a synchronization node preceding + * the core computation of "kernel", then return "node" itself. + * Otherwise, if "node" does not have a preceding synchronization node, + * then create one first. + */ +__isl_give isl_schedule_node *gpu_tree_move_left_to_sync( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) +{ + int has_sync; + int is_sync; + + has_sync = has_sync_before_core(node, kernel); + if (has_sync < 0) + return isl_schedule_node_free(node); + if (has_sync) + return node; + node = gpu_tree_ensure_preceding_sync(node, kernel); + node = isl_schedule_node_parent(node); + while ((is_sync = node_is_sync_filter(node, kernel)) == 0) + node = isl_schedule_node_previous_sibling(node); + if (is_sync < 0) + node = isl_schedule_node_free(node); + node = isl_schedule_node_child(node, 0); + + return node; +} + +/* Move right in the sequence on top of "node" to a synchronization node + * for "kernel". + * If "node" itself contains a synchronization node following + * the core computation of "kernel", then return "node" itself. + * Otherwise, if "node" does not have a following synchronization node, + * then create one first. 
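+ *
+ * As an illustrative sketch (a hypothetical call, for exposition only):
+ *
+ *	node = gpu_tree_move_right_to_sync(node, kernel);
+ *
+ * in the case where "node" itself does not already contain such a
+ * synchronization, leaves "node" pointing underneath the filter that
+ * selects the following synchronization statement, creating that
+ * statement first if the sequence does not contain one yet.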
+ */ +__isl_give isl_schedule_node *gpu_tree_move_right_to_sync( + __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) +{ + int has_sync; + int is_sync; + + has_sync = has_sync_after_core(node, kernel); + if (has_sync < 0) + return isl_schedule_node_free(node); + if (has_sync) + return node; + node = gpu_tree_ensure_following_sync(node, kernel); + node = isl_schedule_node_parent(node); + while ((is_sync = node_is_sync_filter(node, kernel)) == 0) + node = isl_schedule_node_next_sibling(node); + if (is_sync < 0) + node = isl_schedule_node_free(node); + node = isl_schedule_node_child(node, 0); + + return node; +} diff --git a/polly/lib/External/ppcg/grouping.c b/polly/lib/External/ppcg/grouping.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/grouping.c @@ -0,0 +1,684 @@ +/* + * Copyright 2016 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ppcg.h" + +/* Internal data structure for use during the detection of statements + * that can be grouped. + * + * "sc" contains the original schedule constraints (not a copy). + * "dep" contains the intersection of the validity and the proximity + * constraints in "sc". It may be NULL if it has not been computed yet. + * "group_id" is the identifier for the next group that is extracted. + * + * "domain" is the set of statement instances that belong to any of the groups. + * "contraction" maps the elements of "domain" to the corresponding group + * instances. + * "schedule" schedules the statements in each group relatively to each other. + * These last three fields are NULL if no groups have been found so far. + */ +struct ppcg_grouping { + isl_schedule_constraints *sc; + + isl_union_map *dep; + int group_id; + + isl_union_set *domain; + isl_union_pw_multi_aff *contraction; + isl_schedule *schedule; +}; + +/* Clear all memory allocated by "grouping". + */ +static void ppcg_grouping_clear(struct ppcg_grouping *grouping) +{ + isl_union_map_free(grouping->dep); + isl_union_set_free(grouping->domain); + isl_union_pw_multi_aff_free(grouping->contraction); + isl_schedule_free(grouping->schedule); +} + +/* Compute the intersection of the proximity and validity dependences + * in grouping->sc and store the result in grouping->dep, unless + * this intersection has been computed before. + */ +static isl_stat ppcg_grouping_compute_dep(struct ppcg_grouping *grouping) +{ + isl_union_map *validity, *proximity; + + if (grouping->dep) + return isl_stat_ok; + + validity = isl_schedule_constraints_get_validity(grouping->sc); + proximity = isl_schedule_constraints_get_proximity(grouping->sc); + grouping->dep = isl_union_map_intersect(validity, proximity); + + if (!grouping->dep) + return isl_stat_error; + + return isl_stat_ok; +} + +/* Information extracted from one or more consecutive leaves + * in the input schedule. + * + * "list" contains the sets of statement instances in the leaves, + * one element in the list for each original leaf. + * "domain" contains the union of the sets in "list". + * "prefix" contains the prefix schedule of these elements. + */ +struct ppcg_grouping_leaf { + isl_union_set *domain; + isl_union_set_list *list; + isl_multi_union_pw_aff *prefix; +}; + +/* Free all memory allocated for "leaves". 
+ */ +static void ppcg_grouping_leaf_free(int n, struct ppcg_grouping_leaf leaves[]) +{ + int i; + + if (!leaves) + return; + + for (i = 0; i < n; ++i) { + isl_union_set_free(leaves[i].domain); + isl_union_set_list_free(leaves[i].list); + isl_multi_union_pw_aff_free(leaves[i].prefix); + } + + free(leaves); +} + +/* Short-hand for retrieving the prefix schedule at "node" + * in the form of an isl_multi_union_pw_aff. + */ +static __isl_give isl_multi_union_pw_aff *get_prefix( + __isl_keep isl_schedule_node *node) +{ + return isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); +} + +/* Return an array of "n" elements with information extracted from + * the "n" children of "node" starting at "first", all of which + * are known to be filtered leaves. + */ +struct ppcg_grouping_leaf *extract_leaves(__isl_keep isl_schedule_node *node, + int first, int n) +{ + int i; + isl_ctx *ctx; + struct ppcg_grouping_leaf *leaves; + + if (!node) + return NULL; + + ctx = isl_schedule_node_get_ctx(node); + leaves = isl_calloc_array(ctx, struct ppcg_grouping_leaf, n); + if (!leaves) + return NULL; + + for (i = 0; i < n; ++i) { + isl_schedule_node *child; + isl_union_set *domain; + + child = isl_schedule_node_get_child(node, first + i); + child = isl_schedule_node_child(child, 0); + domain = isl_schedule_node_get_domain(child); + leaves[i].domain = isl_union_set_copy(domain); + leaves[i].list = isl_union_set_list_from_union_set(domain); + leaves[i].prefix = get_prefix(child); + isl_schedule_node_free(child); + } + + return leaves; +} + +/* Internal data structure used by merge_leaves. + * + * "src" and "dst" point to the two consecutive leaves that are + * under investigation for being merged. + * "merge" is initially set to 0 and is set to 1 as soon as + * it turns out that it is useful to merge the two leaves. + */ +struct ppcg_merge_leaves_data { + int merge; + struct ppcg_grouping_leaf *src; + struct ppcg_grouping_leaf *dst; +}; + +/* Given a relation "map" between instances of two statements A and B, + * does it relate every instance of A (according to the domain of "src") + * to every instance of B (according to the domain of "dst")? + */ +static isl_bool covers_src_and_dst(__isl_keep isl_map *map, + struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst) +{ + isl_space *space; + isl_set *set1, *set2; + isl_bool is_subset; + + space = isl_space_domain(isl_map_get_space(map)); + set1 = isl_union_set_extract_set(src->domain, space); + set2 = isl_map_domain(isl_map_copy(map)); + is_subset = isl_set_is_subset(set1, set2); + isl_set_free(set1); + isl_set_free(set2); + if (is_subset < 0 || !is_subset) + return is_subset; + + space = isl_space_range(isl_map_get_space(map)); + set1 = isl_union_set_extract_set(dst->domain, space); + set2 = isl_map_range(isl_map_copy(map)); + is_subset = isl_set_is_subset(set1, set2); + isl_set_free(set1); + isl_set_free(set2); + + return is_subset; +} + +/* Given a relation "map" between instances of two statements A and B, + * are pairs of related instances executed together in the input schedule? + * That is, is each pair of instances assigned the same value + * by the corresponding prefix schedules? + * + * In particular, select the subset of "map" that has pairs of elements + * with the same value for the prefix schedules and then check + * if "map" is still a subset of the result. 
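+ *
+ * As an illustrative example (statement names chosen only for
+ * exposition): if the prefix schedules map both A[i] and B[i] to [i],
+ * then a relation { A[i] -> B[i] } passes this check, whereas a relation
+ * containing A[i] -> B[i + 1] is not a subset of the selected part
+ * and the check fails.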
+ */ +static isl_bool matches_prefix(__isl_keep isl_map *map, + struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst) +{ + isl_union_map *umap, *equal; + isl_multi_union_pw_aff *src_prefix, *dst_prefix, *prefix; + isl_bool is_subset; + + src_prefix = isl_multi_union_pw_aff_copy(src->prefix); + dst_prefix = isl_multi_union_pw_aff_copy(dst->prefix); + prefix = isl_multi_union_pw_aff_union_add(src_prefix, dst_prefix); + + umap = isl_union_map_from_map(isl_map_copy(map)); + equal = isl_union_map_copy(umap); + equal = isl_union_map_eq_at_multi_union_pw_aff(equal, prefix); + + is_subset = isl_union_map_is_subset(umap, equal); + + isl_union_map_free(umap); + isl_union_map_free(equal); + + return is_subset; +} + +/* Given a set of validity and proximity schedule constraints "map" + * between statements in consecutive leaves in a valid schedule, + * should the two leaves be merged into one? + * + * In particular, the two are merged if the constraints form + * a bijection between every instance of the first statement and + * every instance of the second statement. Moreover, each + * pair of such dependent instances needs to be executed consecutively + * in the input schedule. That is, they need to be assigned + * the same value by their prefix schedules. + * + * What this means is that for each instance of the first statement + * there is exactly one instance of the second statement that + * is executed immediately after the instance of the first statement and + * that, moreover, both depends on this statement instance and + * should be brought as close as possible to this statement instance. + * In other words, it is both possible to execute the two instances + * together (according to the input schedule) and desirable to do so + * (according to the validity and proximity schedule constraints). + */ +static isl_stat check_merge(__isl_take isl_map *map, void *user) +{ + struct ppcg_merge_leaves_data *data = user; + isl_bool ok; + + ok = covers_src_and_dst(map, data->src, data->dst); + if (ok >= 0 && ok) + ok = isl_map_is_bijective(map); + if (ok >= 0 && ok) + ok = matches_prefix(map, data->src, data->dst); + + isl_map_free(map); + + if (ok < 0) + return isl_stat_error; + if (!ok) + return isl_stat_ok; + + data->merge = 1; + return isl_stat_error; +} + +/* Merge the leaves at position "pos" and "pos + 1" in "leaves". + */ +static isl_stat merge_pair(int n, struct ppcg_grouping_leaf leaves[], int pos) +{ + int i; + + leaves[pos].domain = isl_union_set_union(leaves[pos].domain, + leaves[pos + 1].domain); + leaves[pos].list = isl_union_set_list_concat(leaves[pos].list, + leaves[pos + 1].list); + leaves[pos].prefix = isl_multi_union_pw_aff_union_add( + leaves[pos].prefix, leaves[pos + 1].prefix); + for (i = pos + 1; i + 1 < n; ++i) + leaves[i] = leaves[i + 1]; + leaves[n - 1].domain = NULL; + leaves[n - 1].list = NULL; + leaves[n - 1].prefix = NULL; + + if (!leaves[pos].domain || !leaves[pos].list || !leaves[pos].prefix) + return isl_stat_error; + + return isl_stat_ok; +} + +/* Merge pairs of consecutive leaves in "leaves" taking into account + * the intersection of validity and proximity schedule constraints "dep". + * + * If a leaf has been merged with the next leaf, then the combination + * is checked again for merging with the next leaf. + * That is, if the leaves are A, B and C, then B may not have been + * merged with C, but after merging A and B, it could still be useful + * to merge the combination AB with C. 
+ * + * Two leaves A and B are merged if there are instances of at least + * one pair of statements, one statement in A and one B, such that + * the validity and proximity schedule constraints between them + * make them suitable for merging according to check_merge. + * + * Return the final number of leaves in the sequence, or -1 on error. + */ +static int merge_leaves(int n, struct ppcg_grouping_leaf leaves[], + __isl_keep isl_union_map *dep) +{ + int i; + struct ppcg_merge_leaves_data data; + + for (i = n - 1; i >= 0; --i) { + isl_union_map *dep_i; + isl_stat ok; + + if (i + 1 >= n) + continue; + + dep_i = isl_union_map_copy(dep); + dep_i = isl_union_map_intersect_domain(dep_i, + isl_union_set_copy(leaves[i].domain)); + dep_i = isl_union_map_intersect_range(dep_i, + isl_union_set_copy(leaves[i + 1].domain)); + data.merge = 0; + data.src = &leaves[i]; + data.dst = &leaves[i + 1]; + ok = isl_union_map_foreach_map(dep_i, &check_merge, &data); + isl_union_map_free(dep_i); + if (ok < 0 && !data.merge) + return -1; + if (!data.merge) + continue; + if (merge_pair(n, leaves, i) < 0) + return -1; + --n; + ++i; + } + + return n; +} + +/* Construct a schedule with "domain" as domain, that executes + * the elements of "list" in order (as a sequence). + */ +static __isl_give isl_schedule *schedule_from_domain_and_list( + __isl_keep isl_union_set *domain, __isl_keep isl_union_set_list *list) +{ + isl_schedule *schedule; + isl_schedule_node *node; + + schedule = isl_schedule_from_domain(isl_union_set_copy(domain)); + node = isl_schedule_get_root(schedule); + isl_schedule_free(schedule); + node = isl_schedule_node_child(node, 0); + list = isl_union_set_list_copy(list); + node = isl_schedule_node_insert_sequence(node, list); + schedule = isl_schedule_node_get_schedule(node); + isl_schedule_node_free(node); + + return schedule; +} + +/* Construct a unique identifier for a group in "grouping". + * + * The name is of the form G_n, with n the first value starting at + * grouping->group_id that does not result in an identifier + * that is already in use in the domain of the original schedule + * constraints. + */ +static isl_id *construct_group_id(struct ppcg_grouping *grouping, + __isl_take isl_space *space) +{ + isl_ctx *ctx; + isl_id *id; + isl_bool empty; + isl_union_set *domain; + + if (!space) + return NULL; + + ctx = isl_space_get_ctx(space); + domain = isl_schedule_constraints_get_domain(grouping->sc); + + do { + char buffer[20]; + isl_id *id; + isl_set *set; + + snprintf(buffer, sizeof(buffer), "G_%d", grouping->group_id); + grouping->group_id++; + id = isl_id_alloc(ctx, buffer, NULL); + space = isl_space_set_tuple_id(space, isl_dim_set, id); + set = isl_union_set_extract_set(domain, isl_space_copy(space)); + empty = isl_set_plain_is_empty(set); + isl_set_free(set); + } while (empty >= 0 && !empty); + + if (empty < 0) + space = isl_space_free(space); + + id = isl_space_get_tuple_id(space, isl_dim_set); + + isl_space_free(space); + isl_union_set_free(domain); + + return id; +} + +/* Construct a contraction from "prefix" and "domain" for a new group + * in "grouping". + * + * The values of the prefix schedule "prefix" are used as instances + * of the new group. The identifier of the group is constructed + * in such a way that it does not conflict with those of earlier + * groups nor with statements in the domain of the original + * schedule constraints. + * The isl_multi_union_pw_aff "prefix" then simply needs to be + * converted to an isl_union_pw_multi_aff. 
However, this is not + * possible if "prefix" is zero-dimensional, so in this case, + * a contraction is constructed from "domain" instead. + */ +static isl_union_pw_multi_aff *group_contraction_from_prefix_and_domain( + struct ppcg_grouping *grouping, + __isl_keep isl_multi_union_pw_aff *prefix, + __isl_keep isl_union_set *domain) +{ + isl_id *id; + isl_space *space; + int dim; + + space = isl_multi_union_pw_aff_get_space(prefix); + if (!space) + return NULL; + dim = isl_space_dim(space, isl_dim_set); + id = construct_group_id(grouping, space); + if (dim == 0) { + isl_multi_val *mv; + + space = isl_multi_union_pw_aff_get_space(prefix); + space = isl_space_set_tuple_id(space, isl_dim_set, id); + mv = isl_multi_val_zero(space); + domain = isl_union_set_copy(domain); + return isl_union_pw_multi_aff_multi_val_on_domain(domain, mv); + } + prefix = isl_multi_union_pw_aff_copy(prefix); + prefix = isl_multi_union_pw_aff_set_tuple_id(prefix, isl_dim_out, id); + return isl_union_pw_multi_aff_from_multi_union_pw_aff(prefix); +} + +/* Extend "grouping" with groups corresponding to merged + * leaves in the list of potentially merged leaves "leaves". + * + * The "list" field of each element in "leaves" contains a list + * of the instances sets of the original leaves that have been + * merged into this element. If at least two of the original leaves + * have been merged into a given element, then add the corresponding + * group to "grouping". + * In particular, the domain is extended with the statement instances + * of the merged leaves, the contraction is extended with a mapping + * of these statement instances to instances of a new group and + * the schedule is extended with a schedule that executes + * the statement instances according to the order of the leaves + * in which they appear. + * Since the instances of the groups should already be scheduled apart + * in the schedule into which this schedule will be plugged in, + * the schedules of the individual groups are combined independently + * of each other (as a set). + */ +static isl_stat add_groups(struct ppcg_grouping *grouping, + int n, struct ppcg_grouping_leaf leaves[]) +{ + int i; + + for (i = 0; i < n; ++i) { + int n_leaf; + isl_schedule *schedule; + isl_union_set *domain; + isl_union_pw_multi_aff *upma; + + n_leaf = isl_union_set_list_n_union_set(leaves[i].list); + if (n_leaf < 0) + return isl_stat_error; + if (n_leaf <= 1) + continue; + schedule = schedule_from_domain_and_list(leaves[i].domain, + leaves[i].list); + upma = group_contraction_from_prefix_and_domain(grouping, + leaves[i].prefix, leaves[i].domain); + + domain = isl_union_set_copy(leaves[i].domain); + if (grouping->domain) { + domain = isl_union_set_union(domain, grouping->domain); + upma = isl_union_pw_multi_aff_union_add(upma, + grouping->contraction); + schedule = isl_schedule_set(schedule, + grouping->schedule); + } + grouping->domain = domain; + grouping->contraction = upma; + grouping->schedule = schedule; + + if (!grouping->domain || !grouping->contraction || + !grouping->schedule) + return isl_stat_error; + } + + return isl_stat_ok; +} + +/* Look for any pairs of consecutive leaves among the "n" children of "node" + * starting at "first" that should be merged together. + * Store the results in "grouping". + * + * First make sure the intersection of validity and proximity + * schedule constraints is available and extract the required + * information from the "n" leaves. + * Then try and merge consecutive leaves based on the validity + * and proximity constraints. 
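+ *
+ * As an illustrative example (names chosen only for exposition): if an
+ * element of "leaves" combines two original leaves with domains
+ * { S_1[i] } and { S_2[i] }, then a new group (say G_0) is created whose
+ * instances are the values of the merged prefix schedule, the
+ * contraction maps S_1[i] and S_2[i] to the corresponding group
+ * instance and the added schedule executes the instances of the first
+ * leaf before those of the second.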
+ * If any pairs were successfully merged, then add groups + * corresponding to the merged leaves to "grouping". + */ +static isl_stat group_subsequence(__isl_keep isl_schedule_node *node, + int first, int n, struct ppcg_grouping *grouping) +{ + int n_merge; + struct ppcg_grouping_leaf *leaves; + + if (ppcg_grouping_compute_dep(grouping) < 0) + return isl_stat_error; + + leaves = extract_leaves(node, first, n); + if (!leaves) + return isl_stat_error; + + n_merge = merge_leaves(n, leaves, grouping->dep); + if (n_merge >= 0 && n_merge < n && + add_groups(grouping, n_merge, leaves) < 0) + return isl_stat_error; + + ppcg_grouping_leaf_free(n, leaves); + + return isl_stat_ok; +} + +/* If "node" is a sequence, then check if it has any consecutive + * leaves that should be merged together and store the results + * in "grouping". + * + * In particular, call group_subsequence on each consecutive + * sequence of (filtered) leaves among the children of "node". + */ +static isl_bool detect_groups(__isl_keep isl_schedule_node *node, void *user) +{ + int i, n, first; + struct ppcg_grouping *grouping = user; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence) + return isl_bool_true; + + n = isl_schedule_node_n_children(node); + if (n < 0) + return isl_bool_error; + + first = -1; + for (i = 0; i < n; ++i) { + isl_schedule_node *child; + enum isl_schedule_node_type type; + + child = isl_schedule_node_get_child(node, i); + child = isl_schedule_node_child(child, 0); + type = isl_schedule_node_get_type(child); + isl_schedule_node_free(child); + + if (first >= 0 && type != isl_schedule_node_leaf) { + if (group_subsequence(node, first, i - first, + grouping) < 0) + return isl_bool_error; + first = -1; + } + if (first < 0 && type == isl_schedule_node_leaf) + first = i; + } + if (first >= 0) { + if (group_subsequence(node, first, n - first, grouping) < 0) + return isl_bool_error; + } + + return isl_bool_true; +} + +/* Complete "grouping" to cover all statement instances in the domain + * of grouping->sc. + * + * In particular, grouping->domain is set to the full set of statement + * instances; group->contraction is extended with an identity + * contraction on the additional instances and group->schedule + * is extended with an independent schedule on those additional instances. + * In the extension of group->contraction, the additional instances + * are split into those belong to different statements and those + * that belong to some of the same statements. The first group + * is replaced by its universe in order to simplify the contraction extension. 
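+ *
+ * As an illustrative example (the statement name is chosen only for
+ * exposition): a statement S_3 none of whose instances appears in any
+ * group is mapped onto itself by the identity part of the extended
+ * contraction, and its instances are given an independent schedule that
+ * is combined with the group schedules as a set.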
+ */ +static void complete_grouping(struct ppcg_grouping *grouping) +{ + isl_union_set *domain, *left, *overlap; + isl_union_pw_multi_aff *upma; + isl_schedule *schedule; + + domain = isl_schedule_constraints_get_domain(grouping->sc); + left = isl_union_set_subtract(isl_union_set_copy(domain), + isl_union_set_copy(grouping->domain)); + schedule = isl_schedule_from_domain(isl_union_set_copy(left)); + schedule = isl_schedule_set(schedule, grouping->schedule); + grouping->schedule = schedule; + + overlap = isl_union_set_universe(grouping->domain); + grouping->domain = domain; + overlap = isl_union_set_intersect(isl_union_set_copy(left), overlap); + left = isl_union_set_subtract(left, isl_union_set_copy(overlap)); + left = isl_union_set_universe(left); + left = isl_union_set_union(left, overlap); + upma = isl_union_set_identity_union_pw_multi_aff(left); + upma = isl_union_pw_multi_aff_union_add(upma, grouping->contraction); + grouping->contraction = upma; +} + +/* Compute a schedule on the domain of "sc" that respects the schedule + * constraints in "sc". + * + * "schedule" is a known correct schedule that is used to combine + * groups of statements if options->group_chains is set. + * In particular, statements that are executed consecutively in a sequence + * in this schedule and where all instances of the second depend on + * the instance of the first that is executed in the same iteration + * of outer band nodes are grouped together into a single statement. + * The schedule constraints are then mapped to these groups of statements + * and the resulting schedule is expanded again to refer to the original + * statements. + */ +__isl_give isl_schedule *ppcg_compute_schedule( + __isl_take isl_schedule_constraints *sc, + __isl_keep isl_schedule *schedule, struct ppcg_options *options) +{ + struct ppcg_grouping grouping = { sc }; + isl_union_pw_multi_aff *contraction; + isl_union_map *umap; + isl_schedule *res, *expansion; + + if (!options->group_chains) + return isl_schedule_constraints_compute_schedule(sc); + + grouping.group_id = 0; + if (isl_schedule_foreach_schedule_node_top_down(schedule, + &detect_groups, &grouping) < 0) + goto error; + if (!grouping.contraction) { + ppcg_grouping_clear(&grouping); + return isl_schedule_constraints_compute_schedule(sc); + } + complete_grouping(&grouping); + contraction = isl_union_pw_multi_aff_copy(grouping.contraction); + umap = isl_union_map_from_union_pw_multi_aff(contraction); + + sc = isl_schedule_constraints_apply(sc, umap); + + res = isl_schedule_constraints_compute_schedule(sc); + + contraction = isl_union_pw_multi_aff_copy(grouping.contraction); + expansion = isl_schedule_copy(grouping.schedule); + res = isl_schedule_expand(res, contraction, expansion); + + ppcg_grouping_clear(&grouping); + return res; +error: + ppcg_grouping_clear(&grouping); + isl_schedule_constraints_free(sc); + return NULL; +} diff --git a/polly/lib/External/ppcg/hybrid.h b/polly/lib/External/ppcg/hybrid.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/hybrid.h @@ -0,0 +1,41 @@ +#ifndef HYBRID_H +#define HYBRID_H + +#include +#include + +#include "ppcg.h" + +struct ppcg_ht_bounds; +typedef struct ppcg_ht_bounds ppcg_ht_bounds; + +struct ppcg_ht_phase; +typedef struct ppcg_ht_phase ppcg_ht_phase; + +isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node); +isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node); + +__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop, + __isl_keep isl_schedule_node 
*node); +void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds); +isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds); +isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds, + __isl_keep isl_multi_val *sizes); +__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling( + __isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes, + __isl_take isl_schedule_node *node, struct ppcg_options *options); +__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free( + __isl_take ppcg_ht_bounds *bounds); + +__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark( + __isl_keep isl_schedule_node *node); +__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point( + __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node); +__isl_give isl_schedule_node *hybrid_tile_foreach_phase( + __isl_take isl_schedule_node *node, + __isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node, + void *user), void *user); +__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks( + __isl_take isl_schedule_node *node); + +#endif diff --git a/polly/lib/External/ppcg/hybrid.c b/polly/lib/External/ppcg/hybrid.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/hybrid.c @@ -0,0 +1,2242 @@ +/* + * Copyright 2013 Ecole Normale Superieure + * Copyright 2015 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hybrid.h" +#include "schedule.h" + +/* The hybrid tiling implemented in this file is based on + * Grosser et al., "Hybrid Hexagonal/Classical Tiling for GPUs". + */ + +/* Bounds on relative dependence distances in input to hybrid tiling. + * upper is an upper bound on the relative dependence distances + * in the first space dimension + * -lower is a lower bound on the relative dependence distances + * in all space dimensions. + * + * In particular, + * + * d_i >= -lower_i d_0 + * and + * d_1 <= upper d_0 + * + * for each dependence distance vector d, where d_1 is the component + * corresponding to the first space dimension. + * + * upper and lower are always non-negative. + * Some of the values may be NaN if no bound could be found. + */ +struct ppcg_ht_bounds { + isl_val *upper; + isl_multi_val *lower; +}; + +/* Free "bounds" along with all its fields. + */ +__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free( + __isl_take ppcg_ht_bounds *bounds) +{ + if (!bounds) + return NULL; + isl_val_free(bounds->upper); + isl_multi_val_free(bounds->lower); + free(bounds); + + return NULL; +} + +/* Create a ppcg_ht_bounds object for a band living in "space". + * The bounds are initialized to NaN. 
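+ *
+ * The NaN entries act as placeholders: they are meant to be replaced
+ * through ppcg_ht_bounds_set_upper and ppcg_ht_bounds_set_lower once
+ * actual bounds have been computed, and ppcg_ht_bounds_is_valid treats
+ * any remaining NaN as meaning that no suitable bound was found.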
+ */ +__isl_give ppcg_ht_bounds *ppcg_ht_bounds_alloc(__isl_take isl_space *space) +{ + int i, n; + isl_ctx *ctx; + ppcg_ht_bounds *bounds; + + if (!space) + return NULL; + + ctx = isl_space_get_ctx(space); + bounds = isl_alloc_type(ctx, struct ppcg_ht_bounds); + if (!bounds) + goto error; + bounds->upper = isl_val_nan(ctx); + bounds->lower = isl_multi_val_zero(space); + n = isl_multi_val_dim(bounds->lower, isl_dim_set); + for (i = 0; i < n; ++i) { + isl_val *v = isl_val_copy(bounds->upper); + bounds->lower = isl_multi_val_set_val(bounds->lower, i, v); + } + + if (!bounds->lower || !bounds->upper) + return ppcg_ht_bounds_free(bounds); + + return bounds; +error: + isl_space_free(space); + return NULL; +} + +void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds) +{ + if (!bounds) + return; + + fprintf(stderr, "lower: "); + isl_multi_val_dump(bounds->lower); + fprintf(stderr, "upper: "); + isl_val_dump(bounds->upper); +} + +/* Return the upper bound on the relative dependence distances + * in the first space dimension. + */ +__isl_give isl_val *ppcg_ht_bounds_get_upper(__isl_keep ppcg_ht_bounds *bounds) +{ + if (!bounds) + return NULL; + return isl_val_copy(bounds->upper); +} + +/* Replace the upper bound on the relative dependence distances + * in the first space dimension by "upper". + */ +__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_upper( + __isl_take ppcg_ht_bounds *bounds, __isl_take isl_val *upper) +{ + if (!bounds || !upper) + goto error; + isl_val_free(bounds->upper); + bounds->upper = upper; + return bounds; +error: + ppcg_ht_bounds_free(bounds); + isl_val_free(upper); + return NULL; +} + +/* Return the lower bound on the relative dependence distances + * in space dimension "pos". + */ +__isl_give isl_val *ppcg_ht_bounds_get_lower(__isl_keep ppcg_ht_bounds *bounds, + int pos) +{ + if (!bounds) + return NULL; + return isl_multi_val_get_val(bounds->lower, pos); +} + +/* Replace the lower bound on the relative dependence distances + * in space dimension "pos" by "lower". + */ +__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_lower( + __isl_take ppcg_ht_bounds *bounds, int pos, __isl_take isl_val *lower) +{ + if (!bounds || !lower) + goto error; + bounds->lower = isl_multi_val_set_val(bounds->lower, pos, lower); + if (!bounds->lower) + return ppcg_ht_bounds_free(bounds); + return bounds; +error: + ppcg_ht_bounds_free(bounds); + isl_val_free(lower); + return NULL; +} + +/* Can the bounds on relative dependence distances recorded in "bounds" + * be used to perform hybrid tiling? + * In particular, have appropriate lower and upper bounds been found? + * Any NaN indicates that no corresponding bound was found. + */ +isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds) +{ + isl_bool is_nan; + int i, n; + + if (!bounds) + return isl_bool_error; + is_nan = isl_val_is_nan(bounds->upper); + if (is_nan < 0) + return isl_bool_error; + if (is_nan) + return isl_bool_false; + + n = isl_multi_val_dim(bounds->lower, isl_dim_set); + for (i = 0; i < n; ++i) { + isl_val *v; + + v = isl_multi_val_get_val(bounds->lower, i); + is_nan = isl_val_is_nan(v); + if (is_nan < 0) + return isl_bool_error; + if (is_nan) + return isl_bool_false; + isl_val_free(v); + } + + return isl_bool_true; +} + +/* Structure that represents the basic hexagonal tiling, + * along with information that is needed to perform the hybrid tiling. + * + * "bounds" are the bounds on the dependence distances that + * define the hexagonal shape and the required skewing in the remaining + * space dimensions. 
+ * + * "input_node" points to the input pair of band nodes. + * "input_schedule" is the partial schedule of this input pair of band nodes. + * The space of this schedule is [P -> C], where P is the space + * of the parent node and C is the space of the child node. + * + * "space_sizes" represent the total size of a tile for the space + * dimensions, i.e., those corresponding to the child node. + * The space of "space_sizes" is C. + * If S_0 is the original tile size in the first space dimension, + * then the first entry of "space_sizes" is equal to + * W = 2*S_0 + floor(d_l h) + floor(d_u h). + * The remaining entries are the same as in the original tile sizes. + * + * The basic hexagonal tiling "hex" is defined + * in a "ts" (time-space) space and corresponds to the phase-1 tiles. + * "time_tile" maps the "ts" space to outer time tile. + * Is is equal to ts[t, s] -> floor(t/(2 * S_t)), with S_t the original tile + * size corresponding to the parent node. + * "local_time" maps the "ts" space to the time dimension inside each tile. + * It is equal to ts[t, s] -> t mod (2 S_t), with S_t the original tile + * size corresponding to the parent node. + * "shift_space" shifts the tiles at time tile T = floor(t/(2 S_t)) + * in the space dimension such that they align to a multiple of W. + * It is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W, + * with shift_s = S_0 + floor(d_u h). + * "shift_phase" is the shift taken to go from phase 0 to phase 1 + * It is equal to ts[t, s] -> ts[t + S_t, s + shift_s], + * with shift_s = S_0 + floor(d_u h). + * + * "project_ts" projects the space of the input schedule to the ts-space. + * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0]. + */ +struct ppcg_ht_tiling { + int ref; + + ppcg_ht_bounds *bounds; + isl_schedule_node *input_node; + isl_multi_union_pw_aff *input_schedule; + + isl_multi_val *space_sizes; + + isl_aff *time_tile; + isl_aff *local_time; + isl_aff *shift_space; + isl_multi_aff *shift_phase; + isl_set *hex; + + isl_multi_aff *project_ts; +}; +typedef struct ppcg_ht_tiling ppcg_ht_tiling; + +/* Return the space of the pair of band nodes that form the input + * to the hybrid tiling. + * In particular, return the space [P -> C], where P is the space + * of the parent node and C is the space of the child node. + */ +__isl_give isl_space *ppcg_ht_tiling_get_input_space( + __isl_keep ppcg_ht_tiling *tile) +{ + if (!tile) + return NULL; + + return isl_multi_union_pw_aff_get_space(tile->input_schedule); +} + +/* Remove a reference to "tile" and free "tile" along with all its fields + * as soon as the reference count drops to zero. + */ +static __isl_null ppcg_ht_tiling *ppcg_ht_tiling_free( + __isl_take ppcg_ht_tiling *tiling) +{ + if (!tiling) + return NULL; + if (--tiling->ref > 0) + return NULL; + + ppcg_ht_bounds_free(tiling->bounds); + isl_schedule_node_free(tiling->input_node); + isl_multi_union_pw_aff_free(tiling->input_schedule); + isl_multi_val_free(tiling->space_sizes); + isl_aff_free(tiling->time_tile); + isl_aff_free(tiling->local_time); + isl_aff_free(tiling->shift_space); + isl_multi_aff_free(tiling->shift_phase); + isl_set_free(tiling->hex); + isl_multi_aff_free(tiling->project_ts); + free(tiling); + + return NULL; +} + +/* Return a new reference to "tiling". + */ +__isl_give ppcg_ht_tiling *ppcg_ht_tiling_copy( + __isl_keep ppcg_ht_tiling *tiling) +{ + if (!tiling) + return NULL; + + tiling->ref++; + return tiling; +} + +/* Return the isl_ctx to which "tiling" belongs. 
+ */ +isl_ctx *ppcg_ht_tiling_get_ctx(__isl_keep ppcg_ht_tiling *tiling) +{ + if (!tiling) + return NULL; + + return isl_multi_union_pw_aff_get_ctx(tiling->input_schedule); +} + +/* Representation of one of the two phases of hybrid tiling. + * + * "tiling" points to the shared tiling data. + * + * "time_tile", "local_time" and "shift_space" are equal to the corresponding + * fields of "tiling", pulled back to the input space. + * In case of phase 0, these expressions have also been moved + * from phase 1 to phase 0. + * + * "domain" contains the hexagonal tiling of this phase. + * + * "space_shift" is the shift that should be added to the space band + * in order to be able to apply rectangular tiling to the space. + * For phase 1, it is equal to + * + * [P[t] -> C[s_0, s_i]] -> C[(-(2 * shift_s)*T) % W, dl_i * u] + * + * with shift_s = S_0 + floor(d_u h), + * T equal to "time_tile" and u equal to "local_time". + * For phase 0, it is equal to + * + * [P[t] -> C[s_0, s_i]] -> C[shift_s + (-(2 * shift_s)*T) % W, dl_i * u] + * + * "space_tile" is the space tiling. It is equal to + * + * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size] + */ +struct ppcg_ht_phase { + ppcg_ht_tiling *tiling; + + isl_aff *time_tile; + isl_aff *local_time; + isl_aff *shift_space; + isl_set *domain; + + isl_multi_aff *space_shift; + isl_multi_aff *space_tile; +}; + +/* Free "phase" along with all its fields. + */ +static __isl_null ppcg_ht_phase *ppcg_ht_phase_free( + __isl_take ppcg_ht_phase *phase) +{ + if (!phase) + return NULL; + + ppcg_ht_tiling_free(phase->tiling); + isl_aff_free(phase->time_tile); + isl_aff_free(phase->local_time); + isl_aff_free(phase->shift_space); + isl_set_free(phase->domain); + isl_multi_aff_free(phase->space_shift); + isl_multi_aff_free(phase->space_tile); + free(phase); + + return NULL; +} + +/* Wrapper around ppcg_ht_phase_free for use as an argument + * to isl_id_set_free_user. + */ +static void ppcg_ht_phase_free_wrap(void *user) +{ + ppcg_ht_phase *phase = user; + + ppcg_ht_phase_free(phase); +} + +/* Return the domain of hybrid tiling phase "phase". + */ +static __isl_give isl_set *ppcg_ht_phase_get_domain(ppcg_ht_phase *phase) +{ + if (!phase) + return NULL; + + return isl_set_copy(phase->domain); +} + +/* Return the space of the pair of band nodes that form the input + * to the hybrid tiling of which "phase" is a phase. + * In particular, return the space [P -> C], where P is the space + * of the parent node and C is the space of the child node. + */ +static __isl_give isl_space *ppcg_ht_phase_get_input_space( + __isl_keep ppcg_ht_phase *phase) +{ + if (!phase) + return NULL; + + return ppcg_ht_tiling_get_input_space(phase->tiling); +} + +/* Construct the lower left constraint of the hexagonal tile, i.e., + * + * du a - b <= (2h+1) du - duh + * -du a + b + (2h+1) du - duh >= 0 + * + * where duh = floor(du * h). + * + * This constraint corresponds to (6) in + * "Hybrid Hexagonal/Classical Tiling for GPUs". 
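+ *
+ * As a worked example with arbitrarily chosen values: for h = 3 and
+ * du = 1/2, duh = floor(3/2) = 1, so the constraint becomes
+ *
+ *	a/2 - b <= 7/2 - 1 = 5/2
+ *
+ * or, equivalently, a - 2b <= 5.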
+ */ +static __isl_give isl_constraint *hex_lower_left(__isl_take isl_local_space *ls, + __isl_keep isl_val *h, __isl_keep isl_val *du, __isl_keep isl_val *duh) +{ + isl_val *v; + isl_aff *aff; + + v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); + v = isl_val_mul(v, isl_val_copy(du)); + v = isl_val_sub(v, isl_val_copy(duh)); + aff = isl_aff_val_on_domain(ls, v); + v = isl_val_neg(isl_val_copy(du)); + aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v); + aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1); + + return isl_inequality_from_aff(aff); +} + +/* Construct the lower constraint of the hexagonal tile, i.e., + * + * a <= 2h+1 + * -a + 2h+1 >= 0 + * + * This constraint corresponds to (7) in + * "Hybrid Hexagonal/Classical Tiling for GPUs". + */ +static __isl_give isl_constraint *hex_lower(__isl_take isl_local_space *ls, + __isl_keep isl_val *h) +{ + isl_val *v; + isl_aff *aff; + + v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); + aff = isl_aff_val_on_domain(ls, v); + aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 0, -1); + + return isl_inequality_from_aff(aff); +} + +/* Construct the lower right constraint of the hexagonal tile, i.e., + * + * dl a + b <= (2h+1) dl + duh + (s0-1) + * -dl a - b + (2h+1) dl + duh + (s0-1) >= 0 + * + * where duh = floor(du * h). + * + * This constraint corresponds to (8) in + * "Hybrid Hexagonal/Classical Tiling for GPUs". + */ +static __isl_give isl_constraint *hex_lower_right( + __isl_take isl_local_space *ls, __isl_keep isl_val *h, + __isl_keep isl_val *s0, __isl_keep isl_val *dl, __isl_keep isl_val *duh) +{ + isl_val *v; + isl_aff *aff; + + v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); + v = isl_val_mul(v, isl_val_copy(dl)); + v = isl_val_add(v, isl_val_copy(duh)); + v = isl_val_add(v, isl_val_copy(s0)); + v = isl_val_sub_ui(v, 1); + aff = isl_aff_val_on_domain(ls, v); + v = isl_val_neg(isl_val_copy(dl)); + aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v); + aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1); + + return isl_inequality_from_aff(aff); +} + +/* Construct the upper left constraint of the hexagonal tile, i.e., + * + * dl a + b >= h dl - (d - 1)/d with d = den(dl) + * dl a + b - h dl + (d - 1)/d >= 0 + * + * This constraint corresponds to (10) in + * "Hybrid Hexagonal/Classical Tiling for GPUs". + */ +static __isl_give isl_constraint *hex_upper_left(__isl_take isl_local_space *ls, + __isl_keep isl_val *h, __isl_keep isl_val *dl) +{ + isl_val *v, *d; + isl_aff *aff; + + d = isl_val_get_den_val(dl); + v = isl_val_sub_ui(isl_val_copy(d), 1); + v = isl_val_div(v, d); + v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(dl))); + aff = isl_aff_val_on_domain(ls, v); + aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(dl)); + aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1); + + return isl_inequality_from_aff(aff); +} + +/* Construct the upper right constraint of the hexagonal tile, i.e., + * + * du a - b >= du h - duh - (s0-1) - dlh - (d - 1)/d with d = den(du) + * du a - b - du h + duh + (s0-1) + dlh + (d - 1)/d >= 0 + * + * where dlh = floor(dl * h) and duh = floor(du * h). + * + * This constraint corresponds to (12) in + * "Hybrid Hexagonal/Classical Tiling for GPUs". 
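+ *
+ * As a worked example with arbitrarily chosen values: for h = 3, s0 = 2
+ * and dl = du = 1/2 (so d = 2 and dlh = duh = 1), the right-hand side
+ * evaluates to 3/2 - 1 - 1 - 1 - 1/2 = -2, so the constraint becomes
+ *
+ *	a/2 - b >= -2
+ *
+ * or, equivalently, 2b - a <= 4.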
+ */ +static __isl_give isl_constraint *hex_upper_right( + __isl_take isl_local_space *ls, __isl_keep isl_val *h, + __isl_keep isl_val *s0, __isl_keep isl_val *du, + __isl_keep isl_val *dlh, __isl_keep isl_val *duh) +{ + isl_val *v, *d; + isl_aff *aff; + + d = isl_val_get_den_val(du); + v = isl_val_sub_ui(isl_val_copy(d), 1); + v = isl_val_div(v, d); + v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(du))); + v = isl_val_add(v, isl_val_copy(duh)); + v = isl_val_add(v, isl_val_copy(dlh)); + v = isl_val_add(v, isl_val_copy(s0)); + v = isl_val_sub_ui(v, 1); + aff = isl_aff_val_on_domain(ls, v); + aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(du)); + aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1); + + return isl_inequality_from_aff(aff); +} + +/* Construct the uppper constraint of the hexagonal tile, i.e., + * + * a >= 0 + * + * This constraint corresponds to (13) in + * "Hybrid Hexagonal/Classical Tiling for GPUs". + */ +static __isl_give isl_constraint *hex_upper(__isl_take isl_local_space *ls) +{ + isl_aff *aff; + + aff = isl_aff_var_on_domain(ls, isl_dim_set, 0); + + return isl_inequality_from_aff(aff); +} + +/* Construct the basic hexagonal tile shape. + * "space" is the 2D space in which the hexagon should be constructed. + * h is st-1, with st the tile size in the time dimension + * s0 is the tile size in the space dimension + * dl is a bound on the negative relative dependence distances, i.e., + * + * d_s >= -dl d_t + * + * du is a bound on the positive relative dependence distances, i.e., + * + * d_s <= du d_t + * + * with (d_t,d_s) any dependence distance vector. + * dlh = floor(dl * h) + * duh = floor(du * h) + * + * The shape of the hexagon is as follows: + * + * 0 dlh dlh+s0-1 + * ______ __ + * 0 / \_ / + * / \_ / + * h / \ ______ / + * h+1 \_ // \\_ + * \_ // \\_ + * 2h+1 \______// \\ + * 0 duh duh+s0-1 + * duh+s0-1+dlh + * duh+s0-1+dlh+1+s0+1 + * + * The next hexagon is shifted by duh + dlh + 2 * s0. + * + * The slope of the "/" constraints is dl. + * The slope of the "\_" constraints is du. + */ +static __isl_give isl_set *compute_hexagon(__isl_take isl_space *space, + __isl_keep isl_val *h, __isl_keep isl_val *s0, + __isl_keep isl_val *dl, __isl_keep isl_val *du, + __isl_keep isl_val *dlh, __isl_keep isl_val *duh) +{ + isl_local_space *ls; + isl_constraint *c; + isl_basic_set *bset; + + ls = isl_local_space_from_space(space); + + c = hex_lower_left(isl_local_space_copy(ls), h, du, duh); + bset = isl_basic_set_from_constraint(c); + + c = hex_lower(isl_local_space_copy(ls), h); + bset = isl_basic_set_add_constraint(bset, c); + + c = hex_lower_right(isl_local_space_copy(ls), h, s0, dl, duh); + bset = isl_basic_set_add_constraint(bset, c); + + c = hex_upper_left(isl_local_space_copy(ls), h, dl); + bset = isl_basic_set_add_constraint(bset, c); + + c = hex_upper_right(isl_local_space_copy(ls), h, s0, du, dlh, duh); + bset = isl_basic_set_add_constraint(bset, c); + + c = hex_upper(ls); + bset = isl_basic_set_add_constraint(bset, c); + + return isl_set_from_basic_set(bset); +} + +/* Name of the ts-space. + */ +static const char *ts_space_name = "ts"; + +/* Construct and return the space ts[t, s]. + */ +static __isl_give isl_space *construct_ts_space(isl_ctx *ctx) +{ + isl_space *s; + + s = isl_space_set_alloc(ctx, 0, 2); + s = isl_space_set_tuple_name(s, isl_dim_set, ts_space_name); + + return s; +} + +/* Name of the local ts-space. + */ +static const char *local_ts_space_name = "local_ts"; + +/* Construct and return the space local_ts[t, s]. 
+ */ +static __isl_give isl_space *construct_local_ts_space(isl_ctx *ctx) +{ + isl_space *s; + + s = isl_space_set_alloc(ctx, 0, 2); + s = isl_space_set_tuple_name(s, isl_dim_set, local_ts_space_name); + + return s; +} + +/* Compute the total size of a tile for the space dimensions, + * i.e., those corresponding to the child node + * of the input pattern. + * If S_0 is the original tile size in the first space dimension, + * then the first entry of "space_sizes" is equal to + * W = 2*S_0 + floor(d_l h) + floor(d_u h). + * The remaining entries are the same as in the original tile sizes. + * "tile_sizes" contains the original tile sizes, including + * the tile size corresponding to the parent node. + * "dlh" is equal to floor(d_l h). + * "duh" is equal to floor(d_u h). + */ +static __isl_give isl_multi_val *compute_space_sizes( + __isl_keep isl_multi_val *tile_sizes, + __isl_keep isl_val *dlh, __isl_keep isl_val *duh) +{ + isl_val *size; + isl_multi_val *space_sizes; + + space_sizes = isl_multi_val_copy(tile_sizes); + space_sizes = isl_multi_val_factor_range(space_sizes); + size = isl_multi_val_get_val(space_sizes, 0); + size = isl_val_mul_ui(size, 2); + size = isl_val_add(size, isl_val_copy(duh)); + size = isl_val_add(size, isl_val_copy(dlh)); + space_sizes = isl_multi_val_set_val(space_sizes, 0, size); + + return space_sizes; +} + +/* Compute the offset of phase 1 with respect to phase 0 + * in the ts-space ("space"). + * In particular, return + * + * ts[st, s0 + duh] + */ +static __isl_give isl_multi_val *compute_phase_shift( + __isl_keep isl_space *space, __isl_keep isl_val *st, + __isl_keep isl_val *s0, __isl_keep isl_val *duh) +{ + isl_val *v; + isl_multi_val *phase_shift; + + phase_shift = isl_multi_val_zero(isl_space_copy(space)); + phase_shift = isl_multi_val_set_val(phase_shift, 0, isl_val_copy(st)); + v = isl_val_add(isl_val_copy(duh), isl_val_copy(s0)); + phase_shift = isl_multi_val_set_val(phase_shift, 1, v); + + return phase_shift; +} + +/* Return the function + * + * ts[t, s] -> floor(t/(2 * st)) + * + * representing the time tile. + * "space" is the space ts[t, s]. + */ +static __isl_give isl_aff *compute_time_tile(__isl_keep isl_space *space, + __isl_keep isl_val *st) +{ + isl_val *v; + isl_aff *t; + isl_local_space *ls; + + ls = isl_local_space_from_space(isl_space_copy(space)); + t = isl_aff_var_on_domain(ls, isl_dim_set, 0); + v = isl_val_mul_ui(isl_val_copy(st), 2); + t = isl_aff_floor(isl_aff_scale_down_val(t, v)); + + return t; +} + +/* Compute a shift in the space dimension for tiles + * at time tile T = floor(t/(2 * S_t)) + * such that they align to a multiple of the total space tile dimension W. + * In particular, compute + * + * ts[t, s] -> s + (-(2 * shift_s)*T) % W + * + * where shift_s is the shift of phase 1 with respect to phase 0 + * in the space dimension (the first element of "phase_shift"). + * W is stored in the first element of "space_sizes". + * "time_tile" is the function + * + * ts[t, s] -> floor(t/(2 * S_T)) + * + * Since phase 1 is shifted by shift_s with respect to phase 0, + * the next line of phase 0 (at T+1) is shifted by 2*shift_s + * with respect to the previous line (at T). + * A shift of -(2 * shift_s)*T therefore allows the basic pattern + * (which starts at 0) to be applied. + * However, this shift will be used to obtain the tile coordinate + * in the first space dimension and if the original values + * in the space dimension are non-negative, then the shift should + * not make them negative. 
Moreover, the shift should be as minimal + * as possible. + * Since the pattern repeats itself with a period of W in the space + * dimension, the shift can be replaced by (-(2 * shift_s)*T) % W. + */ +static __isl_give isl_aff *compute_shift_space(__isl_keep isl_aff *time_tile, + __isl_keep isl_multi_val *space_sizes, + __isl_keep isl_multi_val *phase_shift) +{ + isl_val *v; + isl_aff *s, *t; + isl_local_space *ls; + + ls = isl_local_space_from_space(isl_aff_get_domain_space(time_tile)); + t = isl_aff_copy(time_tile); + v = isl_val_mul_ui(isl_multi_val_get_val(phase_shift, 1), 2); + v = isl_val_neg(v); + t = isl_aff_scale_val(t, v); + v = isl_multi_val_get_val(space_sizes, 0); + t = isl_aff_mod_val(t, v); + s = isl_aff_var_on_domain(ls, isl_dim_set, 1); + s = isl_aff_add(s, t); + + return s; +} + +/* Give the phase_shift ts[S_t, S_0 + floor(d_u h)], + * compute a function that applies the shift, i.e., + * + * ts[t, s] -> ts[t + S_t, s + S_0 + floor(d_u h)], + */ +static __isl_give isl_multi_aff *compute_shift_phase( + __isl_keep isl_multi_val *phase_shift) +{ + isl_space *space; + isl_multi_aff *shift; + + space = isl_multi_val_get_space(phase_shift); + shift = isl_multi_aff_multi_val_on_space(space, + isl_multi_val_copy(phase_shift)); + space = isl_multi_aff_get_space(shift); + shift = isl_multi_aff_add(shift, isl_multi_aff_identity(space)); + + return shift; +} + +/* Compute a mapping from the ts-space to the local coordinates + * within each tile. In particular, compute + * + * ts[t, s] -> local_ts[t % (2 S_t), (s + (-(2 * shift_s)*T) % W) % W] + * + * "ts" is the space ts[t, s] + * "local_ts" is the space local_ts[t, s] + * "shift_space" is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W + * "st" is the tile size in the time dimension S_t. + * The first element of "space_sizes" is equal to W. + */ +static __isl_give isl_multi_aff *compute_localize( + __isl_keep isl_space *local_ts, __isl_keep isl_aff *shift_space, + __isl_keep isl_val *st, __isl_keep isl_multi_val *space_sizes) +{ + isl_val *v; + isl_space *space; + isl_aff *s, *t; + isl_multi_aff *localize; + + space = isl_aff_get_domain_space(shift_space); + local_ts = isl_space_copy(local_ts); + space = isl_space_map_from_domain_and_range(space, local_ts); + localize = isl_multi_aff_identity(space); + t = isl_multi_aff_get_aff(localize, 0); + v = isl_val_mul_ui(isl_val_copy(st), 2); + t = isl_aff_mod_val(t, v); + localize = isl_multi_aff_set_aff(localize, 0, t); + s = isl_aff_copy(shift_space); + v = isl_multi_val_get_val(space_sizes, 0); + s = isl_aff_mod_val(s, v); + localize = isl_multi_aff_set_aff(localize, 1, s); + + return localize; +} + +/* Set the project_ts field of "tiling". + * + * This field projects the space of the input schedule to the ts-space. + * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0]. + */ +static __isl_give ppcg_ht_tiling *ppcg_ht_tiling_set_project_ts( + __isl_take ppcg_ht_tiling *tiling) +{ + int n; + isl_space *space; + isl_multi_aff *project; + + if (!tiling) + return NULL; + + space = ppcg_ht_tiling_get_input_space(tiling); + n = isl_space_dim(space, isl_dim_set); + project = isl_multi_aff_project_out_map(space, isl_dim_set, 2, n - 2); + project = isl_multi_aff_set_tuple_name(project, + isl_dim_out, ts_space_name); + if (!project) + return ppcg_ht_tiling_free(tiling); + + tiling->project_ts = project; + + return tiling; +} + +/* Construct a hybrid tiling description from bounds on the dependence + * distances "bounds". + * "input_node" points to the original parent node. 
+ * "input_schedule" is the combined schedule of the parent and child + * node in the input. + * "tile_sizes" are the original, user specified tile sizes. + */ +static __isl_give ppcg_ht_tiling *ppcg_ht_bounds_construct_tiling( + __isl_take ppcg_ht_bounds *bounds, + __isl_keep isl_schedule_node *input_node, + __isl_keep isl_multi_union_pw_aff *input_schedule, + __isl_keep isl_multi_val *tile_sizes) +{ + isl_ctx *ctx; + ppcg_ht_tiling *tiling; + isl_multi_val *space_sizes, *phase_shift; + isl_aff *time_tile, *shift_space; + isl_multi_aff *localize; + isl_val *h, *duh, *dlh; + isl_val *st, *s0, *du, *dl; + isl_space *ts, *local_ts; + + if (!bounds || !input_node || !input_schedule || !tile_sizes) + goto error; + + ctx = isl_multi_union_pw_aff_get_ctx(input_schedule); + tiling = isl_calloc_type(ctx, struct ppcg_ht_tiling); + if (!tiling) + goto error; + tiling->ref = 1; + + st = isl_multi_val_get_val(tile_sizes, 0); + h = isl_val_sub_ui(isl_val_copy(st), 1); + s0 = isl_multi_val_get_val(tile_sizes, 1); + du = ppcg_ht_bounds_get_upper(bounds); + dl = ppcg_ht_bounds_get_lower(bounds, 0); + + duh = isl_val_floor(isl_val_mul(isl_val_copy(du), isl_val_copy(h))); + dlh = isl_val_floor(isl_val_mul(isl_val_copy(dl), isl_val_copy(h))); + + ts = construct_ts_space(ctx); + local_ts = construct_local_ts_space(ctx); + + space_sizes = compute_space_sizes(tile_sizes, dlh, duh); + phase_shift = compute_phase_shift(ts, st, s0, duh); + time_tile = compute_time_tile(ts, st); + shift_space = compute_shift_space(time_tile, space_sizes, phase_shift); + localize = compute_localize(local_ts, shift_space, st, space_sizes); + isl_space_free(ts); + + tiling->input_node = isl_schedule_node_copy(input_node); + tiling->input_schedule = isl_multi_union_pw_aff_copy(input_schedule); + tiling->space_sizes = space_sizes; + tiling->bounds = bounds; + tiling->local_time = isl_multi_aff_get_aff(localize, 0); + tiling->hex = compute_hexagon(local_ts, h, s0, dl, du, dlh, duh); + tiling->hex = isl_set_preimage_multi_aff(tiling->hex, localize); + tiling->time_tile = time_tile; + tiling->shift_space = shift_space; + tiling->shift_phase = compute_shift_phase(phase_shift); + isl_multi_val_free(phase_shift); + + isl_val_free(duh); + isl_val_free(dlh); + isl_val_free(du); + isl_val_free(dl); + isl_val_free(s0); + isl_val_free(st); + isl_val_free(h); + + if (!tiling->input_schedule || !tiling->local_time || !tiling->hex || + !tiling->shift_space || !tiling->shift_phase) + return ppcg_ht_tiling_free(tiling); + + tiling = ppcg_ht_tiling_set_project_ts(tiling); + + return tiling; +error: + ppcg_ht_bounds_free(bounds); + return NULL; +} + +/* Are all members of the band node "node" coincident? + */ +static isl_bool all_coincident(__isl_keep isl_schedule_node *node) +{ + int i, n; + + n = isl_schedule_node_band_n_member(node); + for (i = 0; i < n; ++i) { + isl_bool c; + + c = isl_schedule_node_band_member_get_coincident(node, i); + if (c < 0 || !c) + return c; + } + + return isl_bool_true; +} + +/* Does "node" satisfy the properties of the inner node in the input + * pattern for hybrid tiling? + * That is, is it a band node with only coincident members, of which + * there is at least one? 
+ */ +static isl_bool has_child_properties(__isl_keep isl_schedule_node *node) +{ + if (!node) + return isl_bool_error; + if (isl_schedule_node_get_type(node) != isl_schedule_node_band) + return isl_bool_false; + if (isl_schedule_node_band_n_member(node) < 1) + return isl_bool_false; + return all_coincident(node); +} + +/* Does "node" satisfy the properties of the outer node in the input + * pattern for hybrid tiling? + * That is, is it a band node with a single member? + */ +static isl_bool has_parent_properties(__isl_keep isl_schedule_node *node) +{ + if (!node) + return isl_bool_error; + if (isl_schedule_node_get_type(node) != isl_schedule_node_band) + return isl_bool_false; + if (isl_schedule_node_band_n_member(node) != 1) + return isl_bool_false; + return isl_bool_true; +} + +/* Does the parent of "node" satisfy the input pattern for hybrid tiling? + * That is, does "node" satisfy the properties of the inner node and + * does the parent of "node" satisfy the properties of the outer node? + */ +isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node) +{ + isl_bool has_pattern; + + has_pattern = has_child_properties(node); + if (has_pattern < 0 || !has_pattern) + return has_pattern; + + node = isl_schedule_node_copy(node); + node = isl_schedule_node_parent(node); + has_pattern = has_parent_properties(node); + isl_schedule_node_free(node); + + return has_pattern; +} + +/* Does "node" satisfy the input pattern for hybrid tiling? + * That is, does "node" satisfy the properties of the outer node and + * does the child of "node" satisfy the properties of the inner node? + */ +isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node) +{ + isl_bool has_pattern; + + has_pattern = has_parent_properties(node); + if (has_pattern < 0 || !has_pattern) + return has_pattern; + + node = isl_schedule_node_get_child(node, 0); + has_pattern = has_child_properties(node); + isl_schedule_node_free(node); + + return has_pattern; +} + +/* Check that "node" satisfies the input pattern for hybrid tiling. + * Error out if it does not. + */ +static isl_stat check_input_pattern(__isl_keep isl_schedule_node *node) +{ + isl_bool has_pattern; + + has_pattern = ppcg_ht_has_input_pattern(node); + if (has_pattern < 0) + return isl_stat_error; + if (!has_pattern) + isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid, + "invalid input pattern for hybrid tiling", + return isl_stat_error); + + return isl_stat_ok; +} + +/* Extract the input schedule from "node", i.e., the product + * of the partial schedules of the parent and child nodes + * in the input pattern. + */ +static __isl_give isl_multi_union_pw_aff *extract_input_schedule( + __isl_keep isl_schedule_node *node) +{ + isl_multi_union_pw_aff *partial, *partial2; + + partial = isl_schedule_node_band_get_partial_schedule(node); + node = isl_schedule_node_get_child(node, 0); + partial2 = isl_schedule_node_band_get_partial_schedule(node); + isl_schedule_node_free(node); + + return isl_multi_union_pw_aff_range_product(partial, partial2); +} + +/* Collect all dependences from "scop" that are relevant for performing + * hybrid tiling on "node" and its child and map them to the schedule + * space of this pair of nodes. + * + * In case live range reordering is not used, + * the flow and the false dependences are collected. + * In case live range reordering is used, + * the flow and the forced dependences are collected, as well + * as the order dependences that are adjacent to non-local + * flow dependences.
+ * + * In all cases, only dependences that map to the same instance + * of the outer part of the schedule are considered. + */ +static __isl_give isl_map *collect_deps(struct ppcg_scop *scop, + __isl_keep isl_schedule_node *node) +{ + isl_space *space; + isl_multi_union_pw_aff *prefix, *partial; + isl_union_map *flow, *other, *dep, *umap; + isl_map *map; + + prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); + partial = extract_input_schedule(node); + space = isl_multi_union_pw_aff_get_space(partial); + + flow = isl_union_map_copy(scop->dep_flow); + flow = isl_union_map_eq_at_multi_union_pw_aff(flow, + isl_multi_union_pw_aff_copy(prefix)); + if (!scop->options->live_range_reordering) { + other = isl_union_map_copy(scop->dep_false); + other = isl_union_map_eq_at_multi_union_pw_aff(other, prefix); + } else { + isl_union_map *local, *non_local, *order, *adj; + isl_union_set *domain, *range; + + other = isl_union_map_copy(scop->dep_forced); + other = isl_union_map_eq_at_multi_union_pw_aff(other, + isl_multi_union_pw_aff_copy(prefix)); + local = isl_union_map_copy(flow); + local = isl_union_map_eq_at_multi_union_pw_aff(local, + isl_multi_union_pw_aff_copy(partial)); + non_local = isl_union_map_copy(flow); + non_local = isl_union_map_subtract(non_local, local); + + order = isl_union_map_copy(scop->dep_order); + order = isl_union_map_eq_at_multi_union_pw_aff(order, prefix); + adj = isl_union_map_copy(order); + domain = isl_union_map_domain(isl_union_map_copy(non_local)); + domain = isl_union_set_coalesce(domain); + adj = isl_union_map_intersect_range(adj, domain); + other = isl_union_map_union(other, adj); + + adj = order; + range = isl_union_map_range(non_local); + range = isl_union_set_coalesce(range); + adj = isl_union_map_intersect_domain(adj, range); + other = isl_union_map_union(other, adj); + } + dep = isl_union_map_union(flow, other); + + umap = isl_union_map_from_multi_union_pw_aff(partial); + dep = isl_union_map_apply_domain(dep, isl_union_map_copy(umap)); + dep = isl_union_map_apply_range(dep, umap); + + space = isl_space_map_from_set(space); + map = isl_union_map_extract_map(dep, space); + isl_union_map_free(dep); + + map = isl_map_coalesce(map); + + return map; +} + +/* Given a constraint of the form + * + * a i_0 + b i_1 >= 0 + * or + * a i_0 + b i_1 = 0 + * + * use it to update one or both of the non-negative bounds + * in "list" = (min, max) such that + * + * i_1 >= -min i_0 + * and + * i_1 <= max i_0 + * + * If b = 0, then the constraint cannot be used. + * Otherwise, the constraint is equivalent to + * + * sgn(b) i_1 >= - a/abs(b) i_0 + * i.e., + * i_1 >= - a/abs(b) i_0 + * or + * i_1 <= a/abs(b) i_0 + * + * Set the first or second element of "list" to max(0, a/abs(b)), + * according to the sign of "b". Or set both in case the constraint + * is an equality, taking into account the sign change. + */ +static __isl_give isl_val_list *list_set_min_max(__isl_take isl_val_list *list, + __isl_keep isl_constraint *c) +{ + isl_val *a, *b; + int sign; + int pos; + isl_bool eq, is_zero, is_neg; + + eq = isl_constraint_is_equality(c); + if (eq < 0) + return isl_val_list_free(list); + + b = isl_constraint_get_coefficient_val(c, isl_dim_set, 1); + is_zero = isl_val_is_zero(b); + if (is_zero == isl_bool_true) { + isl_val_free(b); + return list; + } + a = isl_constraint_get_coefficient_val(c, isl_dim_set, 0); + sign = isl_val_sgn(b); + b = isl_val_abs(b); + a = isl_val_div(a, b); + + if (eq) + b = isl_val_copy(a); + + pos = sign > 0 ? 
0 : 1; + is_neg = isl_val_is_neg(a); + if (is_neg == isl_bool_true) + a = isl_val_set_si(a, 0); + list = isl_val_list_set_val(list, pos, a); + + if (!eq) + return is_neg < 0 ? isl_val_list_free(list) : list; + + pos = 1 - pos; + a = isl_val_neg(b); + is_neg = isl_val_is_neg(a); + if (is_neg == isl_bool_true) + a = isl_val_set_si(a, 0); + list = isl_val_list_set_val(list, pos, a); + + return is_neg < 0 ? isl_val_list_free(list) : list; +} + +/* If constraint "c" passes through the origin, then try and use it + * to update the non-negative bounds in "list" = (min, max) such that + * + * i_1 >= -min i_0 + * and + * i_1 <= max i_0 + */ +static isl_stat set_min_max(__isl_take isl_constraint *c, void *user) +{ + isl_val *v; + isl_val_list **list = user; + isl_bool is_zero; + + v = isl_constraint_get_constant_val(c); + is_zero = isl_val_is_zero(v); + isl_val_free(v); + + if (is_zero == isl_bool_true) + *list = list_set_min_max(*list, c); + + isl_constraint_free(c); + return is_zero < 0 ? isl_stat_error : isl_stat_ok; +} + +/* Given a set of dependence distance vectors "dist", compute + * pair of non-negative bounds min and max such that + * + * d_pos >= -min d_0 + * and + * d_pos <= max d_0 + * + * and return the pair (min, max). + * If no bound can be found in either direction, then the bound + * is replaced by NaN. + * + * The dependence distances are first projected onto the (d_0, d_pos). + * Then the zero dependence distance is added and the convex hull is computed. + * Finally, the bounds are extracted from the constraints of the convex hull + * that pass through the origin. + */ +static __isl_give isl_val_list *min_max_dist(__isl_keep isl_set *dist, int pos) +{ + isl_space *space; + isl_basic_set *hull; + int dim; + isl_ctx *ctx; + isl_val *nan; + isl_val_list *list; + + ctx = isl_set_get_ctx(dist); + nan = isl_val_nan(ctx); + list = isl_val_list_alloc(ctx, 2); + list = isl_val_list_add(list, isl_val_copy(nan)); + list = isl_val_list_add(list, nan); + + dist = isl_set_copy(dist); + dim = isl_set_dim(dist, isl_dim_set); + if (dist && pos >= dim) + isl_die(ctx, isl_error_internal, "position out of bounds", + dist = isl_set_free(dist)); + dist = isl_set_project_out(dist, isl_dim_set, pos + 1, dim - (pos + 1)); + dist = isl_set_project_out(dist, isl_dim_set, 1, pos - 1); + + space = isl_set_get_space(dist); + dist = isl_set_union(dist, isl_set_from_point(isl_point_zero(space))); + dist = isl_set_remove_divs(dist); + hull = isl_set_convex_hull(dist); + + if (isl_basic_set_foreach_constraint(hull, &set_min_max, &list) < 0) + list = isl_val_list_free(list); + isl_basic_set_free(hull); + + return list; +} + +/* Given a schedule node "node" that, together with its child, + * satisfies the input pattern for hybrid tiling, compute bounds + * on the relative dependence distances of the child node with + * respect to the parent node. These bounds are needed to + * construct a hybrid tiling. + * + * First all relevant dependences are collected and mapped + * to the schedule space of the pair of nodes. Then, the + * dependence distances are computed in this space. + * + * These dependence distances are then projected onto a two-dimensional + * space consisting of the single schedule dimension of the outer node + * and one of the schedule dimensions of the inner node. + * The maximal and minimal relative dependence distances are extracted + * from these projections. + * This process is repeated for each of the schedule dimensions + * of the inner node. 
For the first dimension, both minimal and + * maximal relative dependence distances are stored in the result. + * For the other dimensions, only the minimal relative dependence + * distance is stored. + */ +__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop, + __isl_keep isl_schedule_node *node) +{ + ppcg_ht_bounds *bnd; + isl_space *space; + isl_map *map; + isl_set *dist; + isl_val_list *pair; + isl_schedule_node *child; + int n; + int i, dim; + + if (!scop || !node || check_input_pattern(node) < 0) + return NULL; + + child = isl_schedule_node_get_child(node, 0); + space = isl_schedule_node_band_get_space(child); + dim = isl_schedule_node_band_n_member(child); + isl_schedule_node_free(child); + bnd = ppcg_ht_bounds_alloc(space); + if (!bnd) + return NULL; + + map = collect_deps(scop, node); + + dist = isl_map_deltas(map); + n = isl_set_dim(dist, isl_dim_param); + dist = isl_set_project_out(dist, isl_dim_param, 0, n); + + pair = min_max_dist(dist, 1); + bnd = ppcg_ht_bounds_set_lower(bnd, 0, isl_val_list_get_val(pair, 0)); + bnd = ppcg_ht_bounds_set_upper(bnd, isl_val_list_get_val(pair, 1)); + isl_val_list_free(pair); + + for (i = 1; i < dim; ++i) { + pair = min_max_dist(dist, 1 + i); + bnd = ppcg_ht_bounds_set_lower(bnd, i, + isl_val_list_get_val(pair, 0)); + isl_val_list_free(pair); + } + + isl_set_free(dist); + + return bnd; +} + +/* Check if all the fields of "phase" are valid, freeing "phase" + * if they are not. + */ +static __isl_give ppcg_ht_phase *check_phase(__isl_take ppcg_ht_phase *phase) +{ + if (!phase) + return NULL; + + if (!phase->tiling || !phase->local_time || + !phase->shift_space || !phase->domain) + return ppcg_ht_phase_free(phase); + + return phase; +} + +/* Construct a ppcg_ht_phase object, that simply copies + * information from "tiling". + * That is, the result is defined over the "ts" space and + * corresponds to phase 1. + */ +static __isl_give ppcg_ht_phase *construct_phase( + __isl_keep ppcg_ht_tiling *tiling) +{ + isl_ctx *ctx; + ppcg_ht_phase *phase; + + if (!tiling) + return NULL; + + ctx = ppcg_ht_tiling_get_ctx(tiling); + phase = isl_calloc_type(ctx, struct ppcg_ht_phase); + if (!phase) + return NULL; + phase->tiling = ppcg_ht_tiling_copy(tiling); + phase->time_tile = isl_aff_copy(tiling->time_tile); + phase->local_time = isl_aff_copy(tiling->local_time); + phase->shift_space = isl_aff_copy(tiling->shift_space); + phase->domain = isl_set_copy(tiling->hex); + + return check_phase(phase); +} + +/* Align the parameters of the elements of "phase" to those of "space". + */ +static __isl_give ppcg_ht_phase *phase_align_params( + __isl_take ppcg_ht_phase *phase, __isl_take isl_space *space) +{ + if (!phase) + goto error; + + phase->time_tile = isl_aff_align_params(phase->time_tile, + isl_space_copy(space)); + phase->local_time = isl_aff_align_params(phase->local_time, + isl_space_copy(space)); + phase->shift_space = isl_aff_align_params(phase->shift_space, + isl_space_copy(space)); + phase->domain = isl_set_align_params(phase->domain, space); + + return check_phase(phase); +error: + isl_space_free(space); + return NULL; +} + +/* Pull back "phase" over "ma". + * That is, take a phase defined over the range of "ma" and + * turn it into a phase defined over the domain of "ma". 
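+ * (This helper is used by both shift_phase() and lift_phase() below.)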
+ */ +static __isl_give ppcg_ht_phase *pullback_phase(__isl_take ppcg_ht_phase *phase, + __isl_take isl_multi_aff *ma) +{ + phase = phase_align_params(phase, isl_multi_aff_get_space(ma)); + if (!phase) + goto error; + + phase->time_tile = isl_aff_pullback_multi_aff(phase->time_tile, + isl_multi_aff_copy(ma)); + phase->local_time = isl_aff_pullback_multi_aff(phase->local_time, + isl_multi_aff_copy(ma)); + phase->shift_space = isl_aff_pullback_multi_aff(phase->shift_space, + isl_multi_aff_copy(ma)); + phase->domain = isl_set_preimage_multi_aff(phase->domain, ma); + + return check_phase(phase); +error: + isl_multi_aff_free(ma); + return NULL; +} + +/* Pullback "phase" over phase->tiling->shift_phase, which shifts + * phase 0 to phase 1. The pullback therefore takes a phase 1 + * description and turns it into a phase 0 description. + */ +static __isl_give ppcg_ht_phase *shift_phase(__isl_take ppcg_ht_phase *phase) +{ + ppcg_ht_tiling *tiling; + + if (!phase) + return NULL; + + tiling = phase->tiling; + return pullback_phase(phase, isl_multi_aff_copy(tiling->shift_phase)); +} + +/* Take a "phase" defined over the ts-space and plug in the projection + * from the input schedule space to the ts-space. + * The result is then defined over this input schedule space. + */ +static __isl_give ppcg_ht_phase *lift_phase(__isl_take ppcg_ht_phase *phase) +{ + ppcg_ht_tiling *tiling; + + if (!phase) + return NULL; + + tiling = phase->tiling; + return pullback_phase(phase, isl_multi_aff_copy(tiling->project_ts)); +} + +/* Compute the shift that should be added to the space band + * in order to be able to apply rectangular tiling to the space. + * Store the shift in phase->space_shift. + * + * In the first dimension, it is equal to shift_space - s. + * For phase 1, this results in + * + * (-(2 * shift_s)*T) % W + * + * In phase 0, the "s" in shift_space has been replaced by "s + shift_s", + * so the result is + * + * shift_s + (-(2 * shift_s)*T) % W + * + * In the other dimensions, the shift is equal to + * + * dl_i * local_time. + */ +static __isl_give ppcg_ht_phase *compute_space_shift( + __isl_take ppcg_ht_phase *phase) +{ + int i, n; + isl_space *space; + isl_local_space *ls; + isl_aff *aff, *s; + isl_multi_aff *space_shift; + + if (!phase) + return NULL; + + space = ppcg_ht_phase_get_input_space(phase); + space = isl_space_unwrap(space); + space = isl_space_range_map(space); + + space_shift = isl_multi_aff_zero(space); + aff = isl_aff_copy(phase->shift_space); + ls = isl_local_space_from_space(isl_aff_get_domain_space(aff)); + s = isl_aff_var_on_domain(ls, isl_dim_set, 1); + aff = isl_aff_sub(aff, s); + space_shift = isl_multi_aff_set_aff(space_shift, 0, aff); + + n = isl_multi_aff_dim(space_shift, isl_dim_out); + for (i = 1; i < n; ++i) { + isl_val *v; + isl_aff *time; + + v = ppcg_ht_bounds_get_lower(phase->tiling->bounds, i); + time = isl_aff_copy(phase->local_time); + time = isl_aff_scale_val(time, v); + space_shift = isl_multi_aff_set_aff(space_shift, i, time); + } + + if (!space_shift) + return ppcg_ht_phase_free(phase); + phase->space_shift = space_shift; + return phase; +} + +/* Compute the space tiling and store the result in phase->space_tile. 
+ * The space tiling is of the form + * + * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)] + */ +static __isl_give ppcg_ht_phase *compute_space_tile( + __isl_take ppcg_ht_phase *phase) +{ + isl_space *space; + isl_multi_val *space_sizes; + isl_multi_aff *space_shift; + isl_multi_aff *tile; + + if (!phase) + return NULL; + + space = ppcg_ht_phase_get_input_space(phase); + space = isl_space_unwrap(space); + tile = isl_multi_aff_range_map(space); + space_shift = isl_multi_aff_copy(phase->space_shift); + tile = isl_multi_aff_add(space_shift, tile); + space_sizes = isl_multi_val_copy(phase->tiling->space_sizes); + tile = isl_multi_aff_scale_down_multi_val(tile, space_sizes); + tile = isl_multi_aff_floor(tile); + + if (!tile) + return ppcg_ht_phase_free(phase); + phase->space_tile = tile; + return phase; +} + +/* Construct a representation for one of the two phases of hybrid tiling + * "tiling". If "shift" is not set, then the phase is constructed + * directly from the hexagonal tile shape in "tiling", which represents + * the phase-1 tiles. If "shift" is set, then this tile shape is shifted + * back over tiling->shift_phase to obtain the phase-0 tiles. + * + * First copy data from "tiling", then optionally shift the phase and + * finally move the tiling from the "ts" space of "tiling" to + * the space of the input pattern. + * + * After the basic phase has been computed, also compute + * the corresponding space shift. + */ +static __isl_give ppcg_ht_phase *ppcg_ht_tiling_compute_phase( + __isl_keep ppcg_ht_tiling *tiling, int shift) +{ + ppcg_ht_phase *phase; + + phase = construct_phase(tiling); + if (shift) + phase = shift_phase(phase); + phase = lift_phase(phase); + + phase = compute_space_shift(phase); + phase = compute_space_tile(phase); + + return phase; +} + +/* Construct a function that is equal to the time tile of "phase0" + * on the domain of "phase0" and equal to the time tile of "phase1" + * on the domain of "phase1". + * The two domains are assumed to form a partition of the input + * schedule space. + */ +static __isl_give isl_pw_multi_aff *combine_time_tile( + __isl_keep ppcg_ht_phase *phase0, __isl_keep ppcg_ht_phase *phase1) +{ + isl_aff *T; + isl_pw_aff *time, *time1; + + if (!phase0 || !phase1) + return NULL; + + T = isl_aff_copy(phase0->time_tile); + time = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase0), T); + + T = isl_aff_copy(phase1->time_tile); + time1 = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase1), T); + + time = isl_pw_aff_union_add(time, time1); + + return isl_pw_multi_aff_from_pw_aff(time); +} + +/* Name used in mark nodes that contain a pointer to a ppcg_ht_phase. + */ +static char *ppcg_phase_name = "phase"; + +/* Does "id" contain a pointer to a ppcg_ht_phase? + * That is, is it called "phase"? + */ +static isl_bool is_phase_id(__isl_keep isl_id *id) +{ + const char *name; + + name = isl_id_get_name(id); + if (!name) + return isl_bool_error; + + return !strcmp(name, ppcg_phase_name); +} + +/* Given a mark node with an identifier that points to a ppcg_ht_phase, + * extract this ppcg_ht_phase pointer.
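+ * Error out if "node" is not a mark node with such an identifier.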
+ */ +__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark( + __isl_keep isl_schedule_node *node) +{ + isl_bool is_phase; + isl_id *id; + void *p; + + if (!node) + return NULL; + if (isl_schedule_node_get_type(node) != isl_schedule_node_mark) + isl_die(isl_schedule_node_get_ctx(node), isl_error_internal, + "not a phase mark", return NULL); + + id = isl_schedule_node_mark_get_id(node); + is_phase = is_phase_id(id); + p = isl_id_get_user(id); + isl_id_free(id); + + if (is_phase < 0) + return NULL; + if (!is_phase) + isl_die(isl_schedule_node_get_ctx(node), isl_error_internal, + "not a phase mark", return NULL); + + return p; +} + +/* Insert a mark node at "node" holding a pointer to "phase". + */ +static __isl_give isl_schedule_node *insert_phase( + __isl_take isl_schedule_node *node, __isl_take ppcg_ht_phase *phase) +{ + isl_ctx *ctx; + isl_id *id; + + if (!node) + goto error; + ctx = isl_schedule_node_get_ctx(node); + id = isl_id_alloc(ctx, ppcg_phase_name, phase); + if (!id) + goto error; + id = isl_id_set_free_user(id, &ppcg_ht_phase_free_wrap); + node = isl_schedule_node_insert_mark(node, id); + + return node; +error: + ppcg_ht_phase_free(phase); + isl_schedule_node_free(node); + return NULL; +} + +/* Construct a mapping from the elements of the original pair of bands + * to which tiling was applied that belong to a tile of "phase" + * to that tile, preserving the values for the outer bands. + * + * The mapping is of the form + * + * [[outer] -> [P -> C]] -> [[outer] -> [tile]] + * + * where tile is defined by a concatenation of the time_tile and + * the space_tile. + */ +static __isl_give isl_map *construct_tile_map(__isl_keep ppcg_ht_phase *phase) +{ + int depth; + isl_space *space; + isl_multi_aff *ma; + isl_multi_aff *tiling; + isl_map *el2tile; + + depth = isl_schedule_node_get_schedule_depth( + phase->tiling->input_node); + space = isl_aff_get_space(phase->time_tile); + space = isl_space_params(space); + space = isl_space_set_from_params(space); + space = isl_space_add_dims(space, isl_dim_set, depth); + space = isl_space_map_from_set(space); + ma = isl_multi_aff_identity(space); + + tiling = isl_multi_aff_flat_range_product( + isl_multi_aff_from_aff(isl_aff_copy(phase->time_tile)), + isl_multi_aff_copy(phase->space_tile)); + el2tile = isl_map_from_multi_aff(tiling); + el2tile = isl_map_intersect_domain(el2tile, + isl_set_copy(phase->domain)); + el2tile = isl_map_product(isl_map_from_multi_aff(ma), el2tile); + + return el2tile; +} + +/* Return a description of the full tiles of "phase" at the point + * in the original schedule tree where the tiling was applied. + * + * First construct a mapping from the input schedule dimensions + * up to an including the original pair of bands to which hybrid tiling + * was applied to schedule dimensions in which this original pair + * has been replaced by the tiles. + * This mapping is of the form + * + * [[outer] -> [P -> C]] -> [[outer] -> [tile]] + * + * Apply this mapping to the set of all values for the input + * schedule dimensions and then apply its inverse. + * The result is the set of values for the input schedule dimensions + * that would map to any of the tiles. Subtracting from this set + * the set of values that are actually executed produces the set + * of values that belong to a tile but that are not executed. + * Mapping these back to the tiles produces a description of + * the partial tiles. 
Subtracting these from the set of all tiles + * produces a description of the full tiles in the form + * + * [[outer] -> [tile]] + */ +static __isl_give isl_set *compute_full_tile(__isl_keep ppcg_ht_phase *phase) +{ + isl_schedule_node *node; + isl_union_set *domain; + isl_union_map *prefix, *schedule; + isl_set *all, *partial, *all_el; + isl_map *tile2el, *el2tile; + isl_multi_union_pw_aff *mupa; + + el2tile = construct_tile_map(phase); + tile2el = isl_map_reverse(isl_map_copy(el2tile)); + + node = phase->tiling->input_node; + prefix = isl_schedule_node_get_prefix_schedule_union_map(node); + domain = isl_schedule_node_get_domain(node); + mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); + schedule = isl_union_map_from_multi_union_pw_aff(mupa); + schedule = isl_union_map_range_product(prefix, schedule); + all_el = isl_set_from_union_set(isl_union_set_apply(domain, schedule)); + all_el = isl_set_coalesce(all_el); + + all = isl_set_apply(isl_set_copy(all_el), isl_map_copy(el2tile)); + + partial = isl_set_copy(all); + partial = isl_set_apply(partial, tile2el); + partial = isl_set_subtract(partial, all_el); + partial = isl_set_apply(partial, el2tile); + + return isl_set_subtract(all, partial); +} + +/* Copy the AST loop types of the non-isolated part to those + * of the isolated part. + */ +static __isl_give isl_schedule_node *set_isolate_loop_type( + __isl_take isl_schedule_node *node) +{ + int i, n; + + n = isl_schedule_node_band_n_member(node); + for (i = 0; i < n; ++i) { + enum isl_ast_loop_type type; + + type = isl_schedule_node_band_member_get_ast_loop_type(node, i); + node = isl_schedule_node_band_member_set_isolate_ast_loop_type( + node, i, type); + } + + return node; +} + +/* If options->isolate_full_tiles is set, then mark the full tiles + * in "node" for isolation. The full tiles are derived from "phase". + * "node" may point to a part of the tiling, e.g., the space tiling. + * + * The full tiles are originally computed in the form + * + * [[outer] -> [tile]] + * + * However, the band that "node" points to may only contain + * subset of the tile dimensions. + * The description above is therefore treated as + * + * [[outer] -> [before; this; after]] + * + * before is of size "pos"; this is of size "dim"; and + * after is of size "out - pos - dim". + * The after part is first project out. Then the range is split + * into a before and this part and finally the before part is moved + * to the domain, resulting in + * + * [[outer; before] -> [this]] + * + * This description is then used as the isolate option. + * + * The AST loop type for the isolated part is set to be the same + * as that of the non-isolated part. 
+ */ +static __isl_give isl_schedule_node *ppcg_ht_phase_isolate_full_tile_node( + __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node, + struct ppcg_options *options) +{ + int in, out, pos, depth, dim; + isl_space *space; + isl_multi_aff *ma1, *ma2; + isl_set *tile; + isl_map *map; + isl_set *set; + isl_union_set *opt; + + if (!options->isolate_full_tiles) + return node; + + depth = isl_schedule_node_get_schedule_depth(node); + dim = isl_schedule_node_band_n_member(node); + + tile = compute_full_tile(phase); + map = isl_set_unwrap(tile); + in = isl_map_dim(map, isl_dim_in); + out = isl_map_dim(map, isl_dim_out); + pos = depth - in; + map = isl_map_project_out(map, isl_dim_out, pos + dim, + out - (pos + dim)); + space = isl_space_range(isl_map_get_space(map)); + ma1 = isl_multi_aff_project_out_map(isl_space_copy(space), + isl_dim_set, pos, dim); + ma2 = isl_multi_aff_project_out_map(space, isl_dim_set, 0, pos); + ma1 = isl_multi_aff_range_product(ma1, ma2); + map = isl_map_apply_range(map, isl_map_from_multi_aff(ma1)); + map = isl_map_uncurry(map); + map = isl_map_flatten_domain(map); + set = isl_map_wrap(map); + set = isl_set_set_tuple_name(set, "isolate"); + + opt = isl_schedule_node_band_get_ast_build_options(node); + opt = isl_union_set_add_set(opt, set); + node = isl_schedule_node_band_set_ast_build_options(node, opt); + node = set_isolate_loop_type(node); + + return node; +} + +/* Insert a band node for performing the space tiling for "phase" at "node". + * In particular, insert a band node with partial schedule + * + * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)] + * + * pulled back over the input schedule. + * "options" determines whether full tiles should be separated + * from partial tiles. + * + * The first tile dimension iterates over the hexagons in the same + * phase, which are independent by construction. The first dimension + * is therefore marked coincident. + * All dimensions are also marked for being generated as atomic loops + * because separation is usually not desirable on tile loops. + */ +static __isl_give isl_schedule_node *insert_space_tiling( + __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node, + struct ppcg_options *options) +{ + isl_multi_aff *space_tile; + isl_multi_union_pw_aff *mupa; + + if (!phase) + return isl_schedule_node_free(node); + + space_tile = isl_multi_aff_copy(phase->space_tile); + mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); + mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_tile); + node = isl_schedule_node_insert_partial_schedule(node, mupa); + node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); + node = ppcg_ht_phase_isolate_full_tile_node(phase, node, options); + node = isl_schedule_node_band_member_set_coincident(node, 0, 1); + + return node; +} + +/* Given a pointer "node" to (a copy of) the original child node + * in the input pattern, adjust its partial schedule such that + * it starts at zero within each tile. + * + * That is, replace "s" by (s + space_shift) % space_sizes. 
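+ * The modulo is taken with respect to the total tile sizes stored + * in phase->tiling->space_sizes.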
+ */ +__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point( + __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node) +{ + isl_multi_val *space_sizes; + isl_multi_aff *space_shift; + isl_multi_union_pw_aff *mupa; + + space_shift = isl_multi_aff_copy(phase->space_shift); + mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); + mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_shift); + node = isl_schedule_node_band_shift(node, mupa); + space_sizes = isl_multi_val_copy(phase->tiling->space_sizes); + node = isl_schedule_node_band_mod(node, space_sizes); + + return node; +} + +/* Does + * + * s0 > delta + 2 * {delta * h} - 1 + * + * hold? + */ +static isl_bool wide_enough(__isl_keep isl_val *s0, __isl_keep isl_val *delta, + __isl_keep isl_val *h) +{ + isl_val *v, *v2; + isl_bool ok; + + v = isl_val_mul(isl_val_copy(delta), isl_val_copy(h)); + v2 = isl_val_floor(isl_val_copy(v)); + v = isl_val_sub(v, v2); + v = isl_val_mul_ui(v, 2); + v = isl_val_add(v, isl_val_copy(delta)); + v = isl_val_sub_ui(v, 1); + ok = isl_val_gt(s0, v); + isl_val_free(v); + + return ok; +} + +/* Is the tile size specified by "sizes" wide enough in the first space + * dimension, i.e., the base of the hexagon? This ensures that, + * after hybrid tiling using "bounds" and these sizes, + * neighboring hexagons in the same phase are far enough apart + * that they do not depend on each other. + * The test is only meaningful if the bounds are valid. + * + * Let st be (half) the size in the time dimension and s0 the base + * size in the first space dimension. Let delta be the dependence + * distance in either positive or negative direction. In principle, + * it should be enough to have s0 + 1 > delta, i.e., s0 >= delta. + * However, in case of fractional delta, the tile is not extended + * with delta * (st - 1), but instead with floor(delta * (st - 1)). + * The condition therefore needs to be adjusted to + * + * s0 + 1 > delta + 2 {delta * (st - 1)} + * + * (with {} the fractional part) to account for the two slanted sides. + * The condition in the paper "Hybrid Hexagonal/Classical Tiling for GPUs" + * translates to + * + * s0 >= delta + {delta * (st - 1)} + * + * Since 1 > frac(delta * (st - 1)), this condition implies + * the condition above. + * + * The condition is checked for both directions. + */ +isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds, + __isl_keep isl_multi_val *sizes) +{ + isl_val *s0, *h; + isl_val *delta; + isl_bool ok; + + ok = ppcg_ht_bounds_is_valid(bounds); + if (ok < 0 || !ok) + return ok; + + h = isl_val_sub_ui(isl_multi_val_get_val(sizes, 0), 1); + s0 = isl_multi_val_get_val(sizes, 1); + + delta = ppcg_ht_bounds_get_lower(bounds, 0); + ok = wide_enough(s0, delta, h); + isl_val_free(delta); + + delta = ppcg_ht_bounds_get_upper(bounds); + if (ok == isl_bool_true) + ok = wide_enough(s0, delta, h); + isl_val_free(delta); + + isl_val_free(s0); + isl_val_free(h); + + return ok; +} + +/* Check that the tile will be wide enough in the first space + * dimension, i.e., the base of the hexagon. This ensures that + * neighboring hexagons in the same phase are far enough apart + * that they do not depend on each other. + * + * Error out if the condition fails to hold. 
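+ * For example, for sizes st = 5 and s0 = 2 and a dependence distance + * delta = 2/3, wide_enough() checks 2 > 2/3 + 2 * {2/3 * 4} - 1 = 1, + * which holds.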
+ */ +static isl_stat check_width(__isl_keep ppcg_ht_bounds *bounds, + __isl_keep isl_multi_val *sizes) +{ + isl_bool ok; + + ok = ppcg_ht_bounds_supports_sizes(bounds, sizes); + + if (ok < 0) + return isl_stat_error; + if (!ok) + isl_die(isl_multi_val_get_ctx(sizes), isl_error_invalid, + "base of hybrid tiling hexagon not sufficiently wide", + return isl_stat_error); + + return isl_stat_ok; +} + +/* Given valid bounds on the relative dependence distances for + * the pair of nested nodes that "node" point to, as well as sufficiently + * wide tile sizes "sizes", insert the corresponding time and space tiling + * at "node", along with a pair of phase nodes that can be used + * to make further changes. + * The space of "sizes" should be the product of the spaces + * of the schedules of the pair of parent and child nodes. + * "options" determines whether full tiles should be separated + * from partial tiles. + * + * In particular, given an input of the form + * + * P - C - ... + * + * the output has the form + * + * /- F0 - M0 - CT0 - P - C - ... + * PT - seq + * \- F1 - M1 - CT1 - P - C - ... + * + * PT is the global time tiling. Within each of these tiles, + * two phases are executed in order. Within each phase, the schedule + * space is further subdivided into tiles through CT0 and CT1. + * The first dimension of each of these iterates over the hexagons + * within a phase and these are independent by construction. + * The F0 and F1 filters filter the statement instances that belong + * to the corresponding phase. The M0 and M1 marks contain a pointer + * to a ppcg_ht_phase object that can be used to perform further changes. + * + * After checking that input satisfies the requirements, + * a data structure is constructed that represents the tiling and + * two additional data structures are constructed for the two phases + * of the tiling. These are then used to define the filters F0 and F1 and + * combined to construct the time tiling PT. + * Then the time tiling node PT is inserted, followed by + * the sequence with the two filters, the CT space tiling nodes and + * the phase markers M. 
+ */ +__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling( + __isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes, + __isl_take isl_schedule_node *node, struct ppcg_options *options) +{ + isl_ctx *ctx; + isl_union_set *phase0; + isl_union_set *phase1; + isl_multi_union_pw_aff *input, *dom_time; + isl_union_pw_multi_aff *upma; + isl_pw_multi_aff *time; + isl_union_set_list *phases; + ppcg_ht_tiling *tiling; + ppcg_ht_phase *phase_0; + ppcg_ht_phase *phase_1; + + if (!node || !sizes || !bounds) + goto error; + if (check_input_pattern(node) < 0 || check_width(bounds, sizes) < 0) + goto error; + + ctx = isl_schedule_node_get_ctx(node); + + input = extract_input_schedule(node); + + tiling = ppcg_ht_bounds_construct_tiling(bounds, node, input, sizes); + phase_0 = ppcg_ht_tiling_compute_phase(tiling, 1); + phase_1 = ppcg_ht_tiling_compute_phase(tiling, 0); + time = combine_time_tile(phase_0, phase_1); + ppcg_ht_tiling_free(tiling); + + upma = isl_union_pw_multi_aff_from_multi_union_pw_aff( + isl_multi_union_pw_aff_copy(input)); + phase0 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_0)); + phase0 = isl_union_set_preimage_union_pw_multi_aff(phase0, + isl_union_pw_multi_aff_copy(upma)); + phase1 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_1)); + phase1 = isl_union_set_preimage_union_pw_multi_aff(phase1, upma); + + phases = isl_union_set_list_alloc(ctx, 2); + phases = isl_union_set_list_add(phases, phase0); + phases = isl_union_set_list_add(phases, phase1); + + dom_time = isl_multi_union_pw_aff_apply_pw_multi_aff(input, time); + node = isl_schedule_node_insert_partial_schedule(node, dom_time); + + node = isl_schedule_node_child(node, 0); + + node = isl_schedule_node_insert_sequence(node, phases); + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); + node = insert_space_tiling(phase_0, node, options); + node = insert_phase(node, phase_0); + node = isl_schedule_node_parent(node); + node = isl_schedule_node_next_sibling(node); + node = isl_schedule_node_child(node, 0); + node = insert_space_tiling(phase_1, node, options); + node = insert_phase(node, phase_1); + node = isl_schedule_node_parent(node); + node = isl_schedule_node_parent(node); + + node = isl_schedule_node_parent(node); + + isl_multi_val_free(sizes); + return node; +error: + isl_multi_val_free(sizes); + isl_schedule_node_free(node); + ppcg_ht_bounds_free(bounds); + return NULL; +} + +/* Given a branch "node" that contains a sequence node with two phases + * of hybrid tiling as input, call "fn" on each of the two phase marker + * nodes. + * + * That is, the input is as follows + * + * /- F0 - M0 - ... + * ... - seq + * \- F1 - M1 - ... + * + * and "fn" is called on M0 and on M1. 
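+ * The node is moved back up to its original position before it is returned.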
+ */ +__isl_give isl_schedule_node *hybrid_tile_foreach_phase( + __isl_take isl_schedule_node *node, + __isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node, + void *user), void *user) +{ + int depth0, depth; + + depth0 = isl_schedule_node_get_tree_depth(node); + + while (node && + isl_schedule_node_get_type(node) != isl_schedule_node_sequence) + node = isl_schedule_node_child(node, 0); + + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); + if (!node) + return NULL; + node = fn(node, user); + node = isl_schedule_node_parent(node); + node = isl_schedule_node_next_sibling(node); + node = isl_schedule_node_child(node, 0); + if (!node) + return NULL; + node = fn(node, user); + node = isl_schedule_node_parent(node); + node = isl_schedule_node_parent(node); + + depth = isl_schedule_node_get_tree_depth(node); + node = isl_schedule_node_ancestor(node, depth - depth0); + + return node; +} + +/* This function is called on each of the two phase marks + * in a hybrid tiling tree. + * Drop the phase mark at "node". + */ +static __isl_give isl_schedule_node *drop_phase_mark( + __isl_take isl_schedule_node *node, void *user) +{ + isl_id *id; + isl_bool is_phase; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_mark) + return node; + + id = isl_schedule_node_mark_get_id(node); + is_phase = is_phase_id(id); + isl_id_free(id); + + if (is_phase < 0) + return isl_schedule_node_free(node); + if (is_phase) + node = isl_schedule_node_delete(node); + + return node; +} + +/* Given a branch "node" that contains a sequence node with two phases + * of hybrid tiling as input, remove the two phase marker nodes. + * + * That is, the input is as follows + * + * /- F0 - M0 - ... + * ... - seq + * \- F1 - M1 - ... + * + * and the output is + * + * /- F0 - ... + * ... - seq + * \- F1 - ... + */ +__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks( + __isl_take isl_schedule_node *node) +{ + return hybrid_tile_foreach_phase(node, &drop_phase_mark, NULL); +} diff --git a/polly/lib/External/ppcg/ocl_utilities.h b/polly/lib/External/ppcg/ocl_utilities.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/ocl_utilities.h @@ -0,0 +1,32 @@ +#ifndef OCL_UTILITIES_H +#define OCL_UTILITIES_H + +#if defined(__APPLE__) +#include <OpenCL/opencl.h> +#else +#include <CL/cl.h> +#endif + +/* Return the OpenCL error string for a given error number. + */ +const char *opencl_error_string(cl_int error); + +/* Find a GPU or a CPU associated with the first available platform. + * If use_gpu is set, then this function first tries to look for a GPU + * in the first available platform. + * If this fails or if use_gpu is not set, then it tries to use the CPU. + */ +cl_device_id opencl_create_device(int use_gpu); + +/* Create an OpenCL program from a string and compile it. + */ +cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev, + const char *program_source, size_t program_size, + const char *opencl_options); + +/* Create an OpenCL program from a source file and compile it. + */ +cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev, + const char* filename, const char* opencl_options); + +#endif diff --git a/polly/lib/External/ppcg/ocl_utilities.c b/polly/lib/External/ppcg/ocl_utilities.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/ocl_utilities.c @@ -0,0 +1,174 @@ +#include <stdio.h> +#include <stdlib.h> +#include "ocl_utilities.h" + +/* Return the OpenCL error string for a given error number.
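+ * Error numbers that are not recognized are mapped to "Unspecified Error".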
+ */ +const char *opencl_error_string(cl_int error) +{ + int errorCount; + int index; + + static const char *errorString[] = { + [CL_SUCCESS] = "CL_SUCCESS", + [-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND", + [-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE", + [-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE", + [-CL_MEM_OBJECT_ALLOCATION_FAILURE] = + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + [-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES", + [-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY", + [-CL_PROFILING_INFO_NOT_AVAILABLE] = + "CL_PROFILING_INFO_NOT_AVAILABLE", + [-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP", + [-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH", + [-CL_IMAGE_FORMAT_NOT_SUPPORTED] = + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + [-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE", + [-CL_MAP_FAILURE] = "CL_MAP_FAILURE", + [-CL_INVALID_VALUE] = "CL_INVALID_VALUE", + [-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE", + [-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM", + [-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE", + [-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT", + [-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES", + [-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE", + [-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR", + [-CL_INVALID_MEM_OBJECT] = "CL_INVALID_MEM_OBJECT", + [-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] = + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + [-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE", + [-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER", + [-CL_INVALID_BINARY] = "CL_INVALID_BINARY", + [-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS", + [-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM", + [-CL_INVALID_PROGRAM_EXECUTABLE] = + "CL_INVALID_PROGRAM_EXECUTABLE", + [-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME", + [-CL_INVALID_KERNEL_DEFINITION] = + "CL_INVALID_KERNEL_DEFINITION", + [-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL", + [-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX", + [-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE", + [-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE", + [-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS", + [-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION", + [-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE", + [-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE", + [-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET", + [-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST", + [-CL_INVALID_EVENT] = "CL_INVALID_EVENT", + [-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION", + [-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT", + [-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE", + [-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL", + [-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE", + [-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY" + }; + + errorCount = sizeof(errorString) / sizeof(errorString[0]); + index = -error; + + return (index >= 0 && index < errorCount) ? + errorString[index] : "Unspecified Error"; +} + +/* Find a GPU or a CPU associated with the first available platform. + * If use_gpu is set, then this function first tries to look for a GPU + * in the first available platform. + * If this fails or if use_gpu is not set, then it tries to use the CPU. 
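+ * The program exits if no platform or no suitable device can be found.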
+ */ +cl_device_id opencl_create_device(int use_gpu) +{ + cl_platform_id platform; + cl_device_id dev; + int err; + + err = clGetPlatformIDs(1, &platform, NULL); + if (err < 0) { + fprintf(stderr, "Error %s while looking for a platform.\n", + opencl_error_string(err)); + exit(1); + } + + err = CL_DEVICE_NOT_FOUND; + if (use_gpu) + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, + NULL); + if (err == CL_DEVICE_NOT_FOUND) + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, + NULL); + if (err < 0) { + fprintf(stderr, "Error %s while looking for a device.\n", + opencl_error_string(err)); + exit(1); + } + return dev; +} + +/* Create an OpenCL program from a string and compile it. + */ +cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev, + const char *program_source, size_t program_size, + const char *opencl_options) +{ + int err; + cl_program program; + char *program_log; + size_t log_size; + + program = clCreateProgramWithSource(ctx, 1, + &program_source, &program_size, &err); + if (err < 0) { + fprintf(stderr, "Could not create the program\n"); + exit(1); + } + err = clBuildProgram(program, 0, NULL, opencl_options, NULL, NULL); + if (err < 0) { + fprintf(stderr, "Could not build the program.\n"); + clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, + NULL, &log_size); + program_log = (char *) malloc(log_size + 1); + program_log[log_size] = '\0'; + clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, + log_size + 1, program_log, NULL); + fprintf(stderr, "%s\n", program_log); + free(program_log); + exit(1); + } + return program; +} + +/* Create an OpenCL program from a source file and compile it. + */ +cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev, + const char* filename, const char* opencl_options) +{ + cl_program program; + FILE *program_file; + char *program_source; + size_t program_size, read; + + program_file = fopen(filename, "r"); + if (program_file == NULL) { + fprintf(stderr, "Could not find the source file.\n"); + exit(1); + } + fseek(program_file, 0, SEEK_END); + program_size = ftell(program_file); + rewind(program_file); + program_source = (char *) malloc(program_size + 1); + program_source[program_size] = '\0'; + read = fread(program_source, sizeof(char), program_size, program_file); + if (read != program_size) { + fprintf(stderr, "Error while reading the kernel.\n"); + exit(1); + } + fclose(program_file); + + program = opencl_build_program_from_string(ctx, dev, program_source, + program_size, opencl_options); + free(program_source); + + return program; +} diff --git a/polly/lib/External/ppcg/opencl.h b/polly/lib/External/ppcg/opencl.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/opencl.h @@ -0,0 +1,11 @@ +#ifndef _OPENCL_H +#define _OPENCL_H + +#include +#include "ppcg_options.h" +#include "ppcg.h" + +int generate_opencl(isl_ctx *ctx, struct ppcg_options *options, + const char *input, const char *output); + +#endif diff --git a/polly/lib/External/ppcg/opencl_test.sh.in b/polly/lib/External/ppcg/opencl_test.sh.in new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/opencl_test.sh.in @@ -0,0 +1,78 @@ +#!/bin/sh + +keep=no + +for option; do + case "$option" in + --keep) + keep=yes + ;; + esac +done + +EXEEXT=@EXEEXT@ +VERSION=@GIT_HEAD_VERSION@ +CC="@CC@" +CFLAGS="--std=gnu99" +srcdir="@srcdir@" + +if [ $keep = "yes" ]; then + OUTDIR="opencl_test.$VERSION" + mkdir "$OUTDIR" || exit 1 +else + if test "x$TMPDIR" = "x"; then + TMPDIR=/tmp + fi + OUTDIR=`mktemp -d 
$TMPDIR/ppcg.XXXXXXXXXX` || exit 1 +fi + +run_tests () { + subdir=$1 + ppcg_options=$2 + + echo Test with PPCG options \'$ppcg_options\' + mkdir ${OUTDIR}/${subdir} || exit 1 + for i in $srcdir/tests/*.c; do + echo $i + name=`basename $i` + name="${name%.c}" + out_c="${OUTDIR}/${subdir}/$name.ppcg.c" + out="${OUTDIR}/${subdir}/$name.ppcg$EXEEXT" + options="--target=opencl --opencl-no-use-gpu $ppcg_options" + functions="$srcdir/tests/${name}_opencl_functions.cl" + if test -f $functions; then + options="$options --opencl-include-file=$functions" + options="$options --opencl-compiler-options=-I." + fi + ./ppcg$EXEEXT $options $i -o "$out_c" || exit + $CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \ + -I. "$out_c" -o "$out" || exit + $out || exit + done +} + +run_tests default +run_tests embed --opencl-embed-kernel-code + +for i in $srcdir/examples/*.c; do + echo $i + name=`basename $i` + name="${name%.c}" + exe_ref="${OUTDIR}/$name.ref$EXEEXT" + gen_ocl="${OUTDIR}/$name.ppcg.c" + exe_ocl="${OUTDIR}/$name.ppcg$EXEEXT" + output_ref="${OUTDIR}/$name.ref.out" + output_ocl="${OUTDIR}/$name.ppcg.out" + $CC $CFLAGS $i -o $exe_ref || exit + ./ppcg$EXEEXT --target=opencl --opencl-no-use-gpu $i -o "$gen_ocl" || \ + exit + $CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \ + "$gen_ocl" -o "$exe_ocl" || exit + $exe_ref > $output_ref || exit + $exe_ocl > $output_ocl || exit + cmp $output_ref $output_ocl || exit +done + +if [ $keep = "no" ]; then + rm -r "${OUTDIR}" +fi diff --git a/polly/lib/External/ppcg/polybench_test.sh.in b/polly/lib/External/ppcg/polybench_test.sh.in new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/polybench_test.sh.in @@ -0,0 +1,109 @@ +#!/bin/sh + +keep=no +verbose=no + +for option; do + case "$option" in + --keep) + keep=yes + ;; + --verbose) + verbose=yes + ;; + esac +done + +EXEEXT=@EXEEXT@ +DIR=@POLYBENCH_DIR@ +VERSION=@GIT_HEAD_VERSION@ +SIZE=-DMINI_DATASET +CC="@CC@" +HAVE_OPENCL=@HAVE_OPENCL@ +HAVE_OPENMP=@HAVE_OPENMP@ +srcdir="@srcdir@" +if [ $keep = "yes" ]; then + OUTDIR="out.$VERSION" + mkdir "$OUTDIR" || exit 1 +else + if test "x$TMPDIR" = "x"; then + TMPDIR=/tmp + fi + OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1 +fi +CPPFLAGS="-DPOLYBENCH_USE_C99_PROTO -DPOLYBENCH_DUMP_ARRAYS" +CPPFLAGS="$CPPFLAGS $SIZE -I $DIR/utilities" +CFLAGS="-lm --std=gnu99" + +echo "Running tests in folder ${OUTDIR}" + +run_tests () { + ext=$1 + + ppcg_options=$2 + cc_options=$3 + + if [ "x$ppcg_options" = "x" ]; then + ppcg_option_str="none" + else + ppcg_option_str=$ppcg_options + fi + + if [ "x$cc_options" = "x" ]; then + cc_option_str="none" + else + cc_option_str=$cc_options + fi + + echo Test: $ext, ppcg options: $ppcg_option_str, CC options: $cc_option_str + for i in `cat $DIR/utilities/benchmark_list`; do + echo $i + name=`basename $i` + name=${name%.c} + source_opt="${OUTDIR}/$name.$ext.c" + prog_orig=${OUTDIR}/$name.orig${EXEEXT} + prog_opt=${OUTDIR}/$name.$ext${EXEEXT} + output_orig=${OUTDIR}/$name.orig.out + output_opt=${OUTDIR}/$name.$ext.out + dir=`dirname $i` + if [ $verbose = "yes" ]; then + echo ./ppcg$EXEEXT -I $DIR/$dir $DIR/$i \ + $CPPFLAGS -o $source_opt $ppcg_options + fi + ./ppcg$EXEEXT -I $DIR/$dir $DIR/$i $CPPFLAGS \ + -o $source_opt $ppcg_options || exit + $CC -I $DIR/$dir $CPPFLAGS $DIR/$i -o $prog_orig \ + $DIR/utilities/polybench.c $CFLAGS + $prog_orig 2> $output_orig + if [ $verbose = "yes" ]; then + echo $CC -I $DIR/$dir $CPPFLAGS $source_opt \ + -o $prog_opt $DIR/utilities/polybench.c \ + $CFLAGS $cc_options + fi + 
$CC -I $DIR/$dir $CPPFLAGS $source_opt -o $prog_opt \ + $DIR/utilities/polybench.c $CFLAGS $cc_options || exit + + $prog_opt 2> $output_opt + cmp $output_orig $output_opt || exit + done +} + +run_tests ppcg "--target=c --tile" +run_tests ppcg_live "--target=c --no-live-range-reordering --tile" + +# Test OpenMP code, if compiler supports openmp +if [ $HAVE_OPENMP = "yes" ]; then + run_tests ppcg_omp "--target=c --openmp" -fopenmp + echo Introduced `grep -R 'omp parallel' "${OUTDIR}" | wc -l` '"pragma omp parallel for"' +else + echo Compiler does not support OpenMP. Skipping OpenMP tests. +fi + +if [ $HAVE_OPENCL = "yes" ]; then + run_tests ppcg_opencl "--target=opencl --opencl-no-use-gpu" \ + "-I $srcdir $srcdir/ocl_utilities.c -lOpenCL" +fi + +if [ $keep = "no" ]; then + rm -r "${OUTDIR}" +fi diff --git a/polly/lib/External/ppcg/ppcg.h b/polly/lib/External/ppcg/ppcg.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/ppcg.h @@ -0,0 +1,128 @@ +#ifndef PPCG_H +#define PPCG_H + +#include +#include +#include +#include +#include +#include + +#include "ppcg_options.h" + +const char *ppcg_base_name(const char *filename); +int ppcg_extract_base_name(char *name, const char *input); + +/* Representation of the scop for use inside PPCG. + * + * "options" are the options specified by the user. + * Some fields in this structure may depend on some of the options. + * + * "start" and "end" are file offsets of the corresponding program text. + * "context" represents constraints on the parameters. + * "domain" is the union of all iteration domains. + * "call" contains the iteration domains of statements with a call expression. + * "reads" contains all potential read accesses. + * "tagged_reads" is the same as "reads", except that the domain is a wrapped + * relation mapping an iteration domain to a reference identifier + * "live_in" contains the potential read accesses that potentially + * have no corresponding writes in the scop. + * "may_writes" contains all potential write accesses. + * "tagged_may_writes" is the same as "may_writes", except that the domain + * is a wrapped relation mapping an iteration domain + * to a reference identifier + * "must_writes" contains all definite write accesses. + * "tagged_must_writes" is the same as "must_writes", except that the domain + * is a wrapped relation mapping an iteration domain + * to a reference identifier + * "live_out" contains the potential write accesses that are potentially + * not killed by any kills or any other writes. + * "must_kills" contains all definite kill accesses. + * "tagged_must_kills" is the same as "must_kills", except that the domain + * is a wrapped relation mapping an iteration domain + * to a reference identifier. + * + * "tagger" maps tagged iteration domains to the corresponding untagged + * iteration domain. + * + * "independence" is the union of all independence filters. + * + * "dep_flow" represents the potential flow dependences. + * "tagged_dep_flow" is the same as "dep_flow", except that both domain and + * range are wrapped relations mapping an iteration domain to + * a reference identifier. May be NULL if not computed. + * "dep_false" represents the potential false (anti and output) dependences. + * "dep_forced" represents the validity constraints that should be enforced + * even when live-range reordering is used. 
+ * In particular, these constraints ensure that all live-in + * accesses remain live-in and that all live-out accesses remain live-out + * and that multiple potential sources for the same read are + * executed in the original order. + * "dep_order"/"tagged_dep_order" represents the order dependences between + * the live range intervals in "dep_flow"/"tagged_dep_flow". + * It is only used if the live_range_reordering + * option is set. Otherwise it is NULL. + * If "dep_order" is used, then "dep_false" only contains a limited + * set of anti and output dependences. + * "schedule" represents the (original) schedule. + * + * "names" contains all variable names that are in use by the scop. + * The names are mapped to a dummy value. + * + * "pet" is the original pet_scop. + */ +struct ppcg_scop { + struct ppcg_options *options; + + unsigned start; + unsigned end; + + isl_set *context; + isl_union_set *domain; + isl_union_set *call; + isl_union_map *tagged_reads; + isl_union_map *reads; + isl_union_map *live_in; + isl_union_map *tagged_may_writes; + isl_union_map *may_writes; + isl_union_map *tagged_must_writes; + isl_union_map *must_writes; + isl_union_map *live_out; + isl_union_map *tagged_must_kills; + isl_union_map *must_kills; + + isl_union_pw_multi_aff *tagger; + + isl_union_map *independence; + + isl_union_map *dep_flow; + isl_union_map *tagged_dep_flow; + isl_union_map *dep_false; + isl_union_map *dep_forced; + isl_union_map *dep_order; + isl_union_map *tagged_dep_order; + isl_schedule *schedule; + + isl_id_to_ast_expr *names; + + struct pet_scop *pet; +}; + +int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop); +__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop, + int n, const char *prefix); + +int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out, + struct ppcg_options *options, + __isl_give isl_printer *(*fn)(__isl_take isl_printer *p, + struct ppcg_scop *scop, void *user), void *user); + +__isl_give isl_schedule *ppcg_compute_schedule( + __isl_take isl_schedule_constraints *sc, + __isl_keep isl_schedule *schedule, struct ppcg_options *options); + +void compute_tagger(struct ppcg_scop *ps); +void compute_dependences(struct ppcg_scop *scop); +void eliminate_dead_code(struct ppcg_scop *ps); +void *ppcg_scop_free(struct ppcg_scop *ps); +#endif diff --git a/polly/lib/External/ppcg/ppcg.c b/polly/lib/External/ppcg/ppcg.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/ppcg.c @@ -0,0 +1,1067 @@ +/* + * Copyright 2011 INRIA Saclay + * Copyright 2013 Ecole Normale Superieure + * Copyright 2015 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, + * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, + * 91893 Orsay, France + * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ppcg.h" +#include "ppcg_options.h" +#include "cuda.h" +#include "opencl.h" +#include "cpu.h" + +struct options { + struct pet_options *pet; + struct ppcg_options *ppcg; + char *input; + char *output; +}; + +const char *ppcg_version(void); +static void print_version(void) +{ + printf("%s", ppcg_version()); +} + +ISL_ARGS_START(struct options, options_args) +ISL_ARG_CHILD(struct options, pet, "pet", &pet_options_args, "pet options") 
+ISL_ARG_CHILD(struct options, ppcg, NULL, &ppcg_options_args, "ppcg options") +ISL_ARG_STR(struct options, output, 'o', NULL, + "filename", NULL, "output filename (c and opencl targets)") +ISL_ARG_ARG(struct options, input, "input", NULL) +ISL_ARG_VERSION(print_version) +ISL_ARGS_END + +ISL_ARG_DEF(options, struct options, options_args) + +/* Return a pointer to the final path component of "filename" or + * to "filename" itself if it does not contain any components. + */ +const char *ppcg_base_name(const char *filename) +{ + const char *base; + + base = strrchr(filename, '/'); + if (base) + return ++base; + else + return filename; +} + +/* Copy the base name of "input" to "name" and return its length. + * "name" is not NULL terminated. + * + * In particular, remove all leading directory components and + * the final extension, if any. + */ +int ppcg_extract_base_name(char *name, const char *input) +{ + const char *base; + const char *ext; + int len; + + base = ppcg_base_name(input); + ext = strrchr(base, '.'); + len = ext ? ext - base : strlen(base); + + memcpy(name, base, len); + + return len; +} + +/* Does "scop" refer to any arrays that are declared, but not + * exposed to the code after the scop? + */ +int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop) +{ + int i; + + if (!scop) + return 0; + + // This is a pet feature not available in Polly. + return 0; + + for (i = 0; i < scop->pet->n_array; ++i) + if (scop->pet->arrays[i]->declared && + !scop->pet->arrays[i]->exposed) + return 1; + + return 0; +} + +/* Collect all variable names that are in use in "scop". + * In particular, collect all parameters in the context and + * all the array names. + * Store these names in an isl_id_to_ast_expr by mapping + * them to a dummy value (0). + */ +static __isl_give isl_id_to_ast_expr *collect_names(struct pet_scop *scop) +{ + int i, n; + isl_ctx *ctx; + isl_ast_expr *zero; + isl_id_to_ast_expr *names; + + ctx = isl_set_get_ctx(scop->context); + + n = isl_set_dim(scop->context, isl_dim_param); + + names = isl_id_to_ast_expr_alloc(ctx, n + scop->n_array); + zero = isl_ast_expr_from_val(isl_val_zero(ctx)); + + for (i = 0; i < n; ++i) { + isl_id *id; + + id = isl_set_get_dim_id(scop->context, isl_dim_param, i); + names = isl_id_to_ast_expr_set(names, + id, isl_ast_expr_copy(zero)); + } + + for (i = 0; i < scop->n_array; ++i) { + struct pet_array *array = scop->arrays[i]; + isl_id *id; + + id = isl_set_get_tuple_id(array->extent); + names = isl_id_to_ast_expr_set(names, + id, isl_ast_expr_copy(zero)); + } + + isl_ast_expr_free(zero); + + return names; +} + +/* Return an isl_id called "prefix%d", with "%d" set to "i". + * If an isl_id with such a name already appears among the variable names + * of "scop", then adjust the name to "prefix%d_%d". + */ +static __isl_give isl_id *generate_name(struct ppcg_scop *scop, + const char *prefix, int i) +{ + int j; + char name[16]; + isl_ctx *ctx; + isl_id *id; + int has_name; + + ctx = isl_set_get_ctx(scop->context); + snprintf(name, sizeof(name), "%s%d", prefix, i); + id = isl_id_alloc(ctx, name, NULL); + + j = 0; + while ((has_name = isl_id_to_ast_expr_has(scop->names, id)) == 1) { + isl_id_free(id); + snprintf(name, sizeof(name), "%s%d_%d", prefix, i, j++); + id = isl_id_alloc(ctx, name, NULL); + } + + return has_name < 0 ? isl_id_free(id) : id; +} + +/* Return a list of "n" isl_ids of the form "prefix%d". + * If an isl_id with such a name already appears among the variable names + * of "scop", then adjust the name to "prefix%d_%d". 
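+ *
+ * For example, asking for three names with prefix "c" normally produces
+ * "c0", "c1" and "c2"; if "c1" happens to be in use already, then
+ * "c1_0" is used instead.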
+ */ +__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop, + int n, const char *prefix) +{ + int i; + isl_ctx *ctx; + isl_id_list *names; + + ctx = isl_set_get_ctx(scop->context); + names = isl_id_list_alloc(ctx, n); + for (i = 0; i < n; ++i) { + isl_id *id; + + id = generate_name(scop, prefix, i); + names = isl_id_list_add(names, id); + } + + return names; +} + +/* Is "stmt" not a kill statement? + */ +static int is_not_kill(struct pet_stmt *stmt) +{ + return !pet_stmt_is_kill(stmt); +} + +/* Collect the iteration domains of the statements in "scop" that + * satisfy "pred". + */ +static __isl_give isl_union_set *collect_domains(struct pet_scop *scop, + int (*pred)(struct pet_stmt *stmt)) +{ + int i; + isl_set *domain_i; + isl_union_set *domain; + + if (!scop) + return NULL; + + domain = isl_union_set_empty(isl_set_get_space(scop->context)); + + for (i = 0; i < scop->n_stmt; ++i) { + struct pet_stmt *stmt = scop->stmts[i]; + + if (!pred(stmt)) + continue; + + if (stmt->n_arg > 0) + isl_die(isl_union_set_get_ctx(domain), + isl_error_unsupported, + "data dependent conditions not supported", + return isl_union_set_free(domain)); + + domain_i = isl_set_copy(scop->stmts[i]->domain); + domain = isl_union_set_add_set(domain, domain_i); + } + + return domain; +} + +/* Collect the iteration domains of the statements in "scop", + * skipping kill statements. + */ +static __isl_give isl_union_set *collect_non_kill_domains(struct pet_scop *scop) +{ + return collect_domains(scop, &is_not_kill); +} + +/* This function is used as a callback to pet_expr_foreach_call_expr + * to detect if there is any call expression in the input expression. + * Assign the value 1 to the integer that "user" points to and + * abort the search since we have found what we were looking for. + */ +static int set_has_call(__isl_keep pet_expr *expr, void *user) +{ + int *has_call = user; + + *has_call = 1; + + return -1; +} + +/* Does "expr" contain any call expressions? + */ +static int expr_has_call(__isl_keep pet_expr *expr) +{ + int has_call = 0; + + if (pet_expr_foreach_call_expr(expr, &set_has_call, &has_call) < 0 && + !has_call) + return -1; + + return has_call; +} + +/* This function is a callback for pet_tree_foreach_expr. + * If "expr" contains any call (sub)expressions, then set *has_call + * and abort the search. + */ +static int check_call(__isl_keep pet_expr *expr, void *user) +{ + int *has_call = user; + + if (expr_has_call(expr)) + *has_call = 1; + + return *has_call ? -1 : 0; +} + +/* Does "stmt" contain any call expressions? + */ +static int has_call(struct pet_stmt *stmt) +{ + int has_call = 0; + + if (pet_tree_foreach_expr(stmt->body, &check_call, &has_call) < 0 && + !has_call) + return -1; + + return has_call; +} + +/* Collect the iteration domains of the statements in "scop" + * that contain a call expression. + */ +static __isl_give isl_union_set *collect_call_domains(struct pet_scop *scop) +{ + return collect_domains(scop, &has_call); +} + +/* Given a union of "tagged" access relations of the form + * + * [S_i[...] -> R_j[]] -> A_k[...] + * + * project out the "tags" (R_j[]). + * That is, return a union of relations of the form + * + * S_i[...] -> A_k[...] 
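+ *
+ * For example, { [S_0[i] -> R_1[]] -> A[i] } is mapped to { S_0[i] -> A[i] }.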
+ */ +static __isl_give isl_union_map *project_out_tags( + __isl_take isl_union_map *umap) +{ + return isl_union_map_domain_factor_domain(umap); +} + +/* Construct a function from tagged iteration domains to the corresponding + * untagged iteration domains with as range of the wrapped map in the domain + * the reference tags that appear in any of the reads, writes or kills. + * Store the result in ps->tagger. + * + * For example, if the statement with iteration space S[i,j] + * contains two array references R_1[] and R_2[], then ps->tagger will contain + * + * { [S[i,j] -> R_1[]] -> S[i,j]; [S[i,j] -> R_2[]] -> S[i,j] } + */ +void compute_tagger(struct ppcg_scop *ps) +{ + isl_union_map *tagged; + isl_union_pw_multi_aff *tagger; + + tagged = isl_union_map_copy(ps->tagged_reads); + tagged = isl_union_map_union(tagged, + isl_union_map_copy(ps->tagged_may_writes)); + tagged = isl_union_map_union(tagged, + isl_union_map_copy(ps->tagged_must_kills)); + tagged = isl_union_map_universe(tagged); + tagged = isl_union_set_unwrap(isl_union_map_domain(tagged)); + + tagger = isl_union_map_domain_map_union_pw_multi_aff(tagged); + + ps->tagger = tagger; +} + +/* Compute the live out accesses, i.e., the writes that are + * potentially not killed by any kills or any other writes, and + * store them in ps->live_out. + * + * We compute the "dependence" of any "kill" (an explicit kill + * or a must write) on any may write. + * The elements accessed by the may writes with a "depending" kill + * also accessing the element are definitely killed. + * The remaining may writes can potentially be live out. + * + * The result of the dependence analysis is + * + * { IW -> [IK -> A] } + * + * with IW the instance of the write statement, IK the instance of kill + * statement and A the element that was killed. + * The range factor range is + * + * { IW -> A } + * + * containing all such pairs for which there is a kill statement instance, + * i.e., all pairs that have been killed. + */ +static void compute_live_out(struct ppcg_scop *ps) +{ + isl_schedule *schedule; + isl_union_map *kills; + isl_union_map *exposed; + isl_union_map *covering; + isl_union_access_info *access; + isl_union_flow *flow; + + schedule = isl_schedule_copy(ps->schedule); + kills = isl_union_map_union(isl_union_map_copy(ps->must_writes), + isl_union_map_copy(ps->must_kills)); + access = isl_union_access_info_from_sink(kills); + access = isl_union_access_info_set_may_source(access, + isl_union_map_copy(ps->may_writes)); + access = isl_union_access_info_set_schedule(access, schedule); + flow = isl_union_access_info_compute_flow(access); + covering = isl_union_flow_get_full_may_dependence(flow); + isl_union_flow_free(flow); + + covering = isl_union_map_range_factor_range(covering); + exposed = isl_union_map_copy(ps->may_writes); + exposed = isl_union_map_subtract(exposed, covering); + ps->live_out = exposed; +} + +/* Compute the tagged flow dependences and the live_in accesses and store + * the results in ps->tagged_dep_flow and ps->live_in. + * + * We allow both the must writes and the must kills to serve as + * definite sources such that a subsequent read would not depend + * on any earlier write. The resulting flow dependences with + * a must kill as source reflect possibly uninitialized reads. + * No dependences need to be introduced to protect such reads + * (other than those imposed by potential flows from may writes + * that follow the kill). We therefore remove those flow dependences. 
+ * This is also useful for the dead code elimination, which assumes + * the flow sources are non-kill instances. + */ +static void compute_tagged_flow_dep_only(struct ppcg_scop *ps) +{ + isl_union_pw_multi_aff *tagger; + isl_schedule *schedule; + isl_union_map *live_in; + isl_union_access_info *access; + isl_union_flow *flow; + isl_union_map *must_source; + isl_union_map *kills; + isl_union_map *tagged_flow; + + tagger = isl_union_pw_multi_aff_copy(ps->tagger); + schedule = isl_schedule_copy(ps->schedule); + schedule = isl_schedule_pullback_union_pw_multi_aff(schedule, tagger); + kills = isl_union_map_copy(ps->tagged_must_kills); + must_source = isl_union_map_copy(ps->tagged_must_writes); + must_source = isl_union_map_union(must_source, + isl_union_map_copy(kills)); + access = isl_union_access_info_from_sink( + isl_union_map_copy(ps->tagged_reads)); + access = isl_union_access_info_set_must_source(access, must_source); + access = isl_union_access_info_set_may_source(access, + isl_union_map_copy(ps->tagged_may_writes)); + access = isl_union_access_info_set_schedule(access, schedule); + flow = isl_union_access_info_compute_flow(access); + tagged_flow = isl_union_flow_get_may_dependence(flow); + tagged_flow = isl_union_map_subtract_domain(tagged_flow, + isl_union_map_domain(kills)); + ps->tagged_dep_flow = tagged_flow; + live_in = isl_union_flow_get_may_no_source(flow); + ps->live_in = project_out_tags(live_in); + isl_union_flow_free(flow); +} + +/* Compute ps->dep_flow from ps->tagged_dep_flow + * by projecting out the reference tags. + */ +static void derive_flow_dep_from_tagged_flow_dep(struct ppcg_scop *ps) +{ + ps->dep_flow = isl_union_map_copy(ps->tagged_dep_flow); + ps->dep_flow = isl_union_map_factor_domain(ps->dep_flow); +} + +/* Compute the flow dependences and the live_in accesses and store + * the results in ps->dep_flow and ps->live_in. + * A copy of the flow dependences, tagged with the reference tags + * is stored in ps->tagged_dep_flow. + * + * We first compute ps->tagged_dep_flow, i.e., the tagged flow dependences + * and then project out the tags. + */ +static void compute_tagged_flow_dep(struct ppcg_scop *ps) +{ + compute_tagged_flow_dep_only(ps); + derive_flow_dep_from_tagged_flow_dep(ps); +} + +/* Compute the order dependences that prevent the potential live ranges + * from overlapping. + * + * In particular, construct a union of relations + * + * [R[...] -> R_1[]] -> [W[...] -> R_2[]] + * + * where [R[...] -> R_1[]] is the range of one or more live ranges + * (i.e., a read) and [W[...] -> R_2[]] is the domain of one or more + * live ranges (i.e., a write). Moreover, the read and the write + * access the same memory element and the read occurs before the write + * in the original schedule. + * The scheduler allows some of these dependences to be violated, provided + * the adjacent live ranges are all local (i.e., their domain and range + * are mapped to the same point by the current schedule band). + * + * Note that if a live range is not local, then we need to make + * sure it does not overlap with _any_ other live range, and not + * just with the "previous" and/or the "next" live range. + * We therefore add order dependences between reads and + * _any_ later potential write. + * + * We also need to be careful about writes without a corresponding read. + * They are already prevented from moving past non-local preceding + * intervals, but we also need to prevent them from moving past non-local + * following intervals. 
We therefore also add order dependences from + * potential writes that do not appear in any intervals + * to all later potential writes. + * Note that dead code elimination should have removed most of these + * dead writes, but the dead code elimination may not remove all dead writes, + * so we need to consider them to be safe. + * + * The order dependences are computed by computing the "dataflow" + * from the above unmatched writes and the reads to the may writes. + * The unmatched writes and the reads are treated as may sources + * such that they would not kill order dependences from earlier + * such writes and reads. + */ +static void compute_order_dependences(struct ppcg_scop *ps) +{ + isl_union_map *reads; + isl_union_map *shared_access; + isl_union_set *matched; + isl_union_map *unmatched; + isl_union_pw_multi_aff *tagger; + isl_schedule *schedule; + isl_union_access_info *access; + isl_union_flow *flow; + + tagger = isl_union_pw_multi_aff_copy(ps->tagger); + schedule = isl_schedule_copy(ps->schedule); + schedule = isl_schedule_pullback_union_pw_multi_aff(schedule, tagger); + reads = isl_union_map_copy(ps->tagged_reads); + matched = isl_union_map_domain(isl_union_map_copy(ps->tagged_dep_flow)); + unmatched = isl_union_map_copy(ps->tagged_may_writes); + unmatched = isl_union_map_subtract_domain(unmatched, matched); + reads = isl_union_map_union(reads, unmatched); + access = isl_union_access_info_from_sink( + isl_union_map_copy(ps->tagged_may_writes)); + access = isl_union_access_info_set_may_source(access, reads); + access = isl_union_access_info_set_schedule(access, schedule); + flow = isl_union_access_info_compute_flow(access); + shared_access = isl_union_flow_get_may_dependence(flow); + isl_union_flow_free(flow); + + ps->tagged_dep_order = isl_union_map_copy(shared_access); + ps->dep_order = isl_union_map_factor_domain(shared_access); +} + +/* Compute those validity dependences of the program represented by "scop" + * that should be unconditionally enforced even when live-range reordering + * is used. + * + * In particular, compute the external false dependences + * as well as order dependences between sources with the same sink. + * The anti-dependences are already taken care of by the order dependences. + * The external false dependences are only used to ensure that live-in and + * live-out data is not overwritten by any writes inside the scop. + * The independences are removed from the external false dependences, + * but not from the order dependences between sources with the same sink. + * + * In particular, the reads from live-in data need to precede any + * later write to the same memory element. + * As to live-out data, the last writes need to remain the last writes. + * That is, any earlier write in the original schedule needs to precede + * the last write to the same memory element in the computed schedule. + * The possible last writes have been computed by compute_live_out. + * They may include kills, but if the last access is a kill, + * then the corresponding dependences will effectively be ignored + * since we do not schedule any kill statements. + * + * Note that the set of live-in and live-out accesses may be + * an overapproximation. There may therefore be potential writes + * before a live-in access and after a live-out access. + * + * In the presence of may-writes, there may be multiple live-ranges + * with the same sink, accessing the same memory element. 
+ * The sources of these live-ranges need to be executed + * in the same relative order as in the original program + * since we do not know which of the may-writes will actually + * perform a write. Consider all sources that share a sink and + * that may write to the same memory element and compute + * the order dependences among them. + */ +static void compute_forced_dependences(struct ppcg_scop *ps) +{ + isl_union_map *shared_access; + isl_union_map *exposed; + isl_union_map *live_in; + isl_union_map *sink_access; + isl_union_map *shared_sink; + isl_union_access_info *access; + isl_union_flow *flow; + isl_schedule *schedule; + + exposed = isl_union_map_copy(ps->live_out); + schedule = isl_schedule_copy(ps->schedule); + access = isl_union_access_info_from_sink(exposed); + access = isl_union_access_info_set_may_source(access, + isl_union_map_copy(ps->may_writes)); + access = isl_union_access_info_set_schedule(access, schedule); + flow = isl_union_access_info_compute_flow(access); + shared_access = isl_union_flow_get_may_dependence(flow); + isl_union_flow_free(flow); + ps->dep_forced = shared_access; + + schedule = isl_schedule_copy(ps->schedule); + access = isl_union_access_info_from_sink( + isl_union_map_copy(ps->may_writes)); + access = isl_union_access_info_set_may_source(access, + isl_union_map_copy(ps->live_in)); + access = isl_union_access_info_set_schedule(access, schedule); + flow = isl_union_access_info_compute_flow(access); + live_in = isl_union_flow_get_may_dependence(flow); + isl_union_flow_free(flow); + + ps->dep_forced = isl_union_map_union(ps->dep_forced, live_in); + ps->dep_forced = isl_union_map_subtract(ps->dep_forced, + isl_union_map_copy(ps->independence)); + + schedule = isl_schedule_copy(ps->schedule); + sink_access = isl_union_map_copy(ps->tagged_dep_flow); + sink_access = isl_union_map_range_product(sink_access, + isl_union_map_copy(ps->tagged_may_writes)); + sink_access = isl_union_map_domain_factor_domain(sink_access); + access = isl_union_access_info_from_sink( + isl_union_map_copy(sink_access)); + access = isl_union_access_info_set_may_source(access, sink_access); + access = isl_union_access_info_set_schedule(access, schedule); + flow = isl_union_access_info_compute_flow(access); + shared_sink = isl_union_flow_get_may_dependence(flow); + isl_union_flow_free(flow); + ps->dep_forced = isl_union_map_union(ps->dep_forced, shared_sink); +} + +/* Remove independence from the tagged flow dependences. + * Since the user has guaranteed that source and sink of an independence + * can be executed in any order, there cannot be a flow dependence + * between them, so they can be removed from the set of flow dependences. + * However, if the source of such a flow dependence is a must write, + * then it may have killed other potential sources, which would have + * to be recovered if we were to remove those flow dependences. + * We therefore keep the flow dependences that originate in a must write, + * even if it corresponds to a known independence. 
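+ *
+ * The reference tags are temporarily moved out of the way using
+ * isl_union_map_zip so that the independences, which relate plain
+ * statement instances, can be intersected with the tagged dependences.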
+ */ +static void remove_independences_from_tagged_flow(struct ppcg_scop *ps) +{ + isl_union_map *tf; + isl_union_set *indep; + isl_union_set *mw; + + tf = isl_union_map_copy(ps->tagged_dep_flow); + tf = isl_union_map_zip(tf); + indep = isl_union_map_wrap(isl_union_map_copy(ps->independence)); + tf = isl_union_map_intersect_domain(tf, indep); + tf = isl_union_map_zip(tf); + mw = isl_union_map_domain(isl_union_map_copy(ps->tagged_must_writes)); + tf = isl_union_map_subtract_domain(tf, mw); + ps->tagged_dep_flow = isl_union_map_subtract(ps->tagged_dep_flow, tf); +} + +/* Compute the dependences of the program represented by "scop" + * in case live range reordering is allowed. + * + * We compute the actual live ranges and the corresponding order + * false dependences. + * + * The independences are removed from the flow dependences + * (provided the source is not a must-write) as well as + * from the external false dependences (by compute_forced_dependences). + */ +static void compute_live_range_reordering_dependences(struct ppcg_scop *ps) +{ + compute_tagged_flow_dep_only(ps); + remove_independences_from_tagged_flow(ps); + derive_flow_dep_from_tagged_flow_dep(ps); + compute_order_dependences(ps); + compute_forced_dependences(ps); +} + +/* Compute the potential flow dependences and the potential live in + * accesses. + */ +static void compute_flow_dep(struct ppcg_scop *ps) +{ + isl_union_access_info *access; + isl_union_flow *flow; + + access = isl_union_access_info_from_sink(isl_union_map_copy(ps->reads)); + access = isl_union_access_info_set_must_source(access, + isl_union_map_copy(ps->must_writes)); + access = isl_union_access_info_set_may_source(access, + isl_union_map_copy(ps->may_writes)); + access = isl_union_access_info_set_schedule(access, + isl_schedule_copy(ps->schedule)); + flow = isl_union_access_info_compute_flow(access); + + ps->dep_flow = isl_union_flow_get_may_dependence(flow); + ps->live_in = isl_union_flow_get_may_no_source(flow); + isl_union_flow_free(flow); +} + +/* Compute the dependences of the program represented by "scop". + * Store the computed potential flow dependences + * in scop->dep_flow and the reads with potentially no corresponding writes in + * scop->live_in. + * Store the potential live out accesses in scop->live_out. + * Store the potential false (anti and output) dependences in scop->dep_false. + * + * If live range reordering is allowed, then we compute a separate + * set of order dependences and a set of external false dependences + * in compute_live_range_reordering_dependences. 
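+ *
+ * The false dependences are computed with the may writes as sinks and
+ * with both the reads and the may writes as may sources, so they cover
+ * the anti as well as the output dependences.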
+ */ +void compute_dependences(struct ppcg_scop *scop) +{ + isl_union_map *may_source; + isl_union_access_info *access; + isl_union_flow *flow; + + if (!scop) + return; + + compute_live_out(scop); + + if (scop->options->live_range_reordering) + compute_live_range_reordering_dependences(scop); + else if (scop->options->target != PPCG_TARGET_C) + compute_tagged_flow_dep(scop); + else + compute_flow_dep(scop); + + may_source = isl_union_map_union(isl_union_map_copy(scop->may_writes), + isl_union_map_copy(scop->reads)); + access = isl_union_access_info_from_sink( + isl_union_map_copy(scop->may_writes)); + access = isl_union_access_info_set_must_source(access, + isl_union_map_copy(scop->must_writes)); + access = isl_union_access_info_set_may_source(access, may_source); + access = isl_union_access_info_set_schedule(access, + isl_schedule_copy(scop->schedule)); + flow = isl_union_access_info_compute_flow(access); + + scop->dep_false = isl_union_flow_get_may_dependence(flow); + scop->dep_false = isl_union_map_coalesce(scop->dep_false); + isl_union_flow_free(flow); +} + +/* Eliminate dead code from ps->domain. + * + * In particular, intersect both ps->domain and the domain of + * ps->schedule with the (parts of) iteration + * domains that are needed to produce the output or for statement + * iterations that call functions. + * Also intersect the range of the dataflow dependences with + * this domain such that the removed instances will no longer + * be considered as targets of dataflow. + * + * We start with the iteration domains that call functions + * and the set of iterations that last write to an array + * (except those that are later killed). + * + * Then we add those statement iterations that produce + * something needed by the "live" statements iterations. + * We keep doing this until no more statement iterations can be added. + * To ensure that the procedure terminates, we compute the affine + * hull of the live iterations (bounded to the original iteration + * domains) each time we have added extra iterations. + */ +void eliminate_dead_code(struct ppcg_scop *ps) +{ + isl_union_set *live; + isl_union_map *dep; + isl_union_pw_multi_aff *tagger; + + live = isl_union_map_domain(isl_union_map_copy(ps->live_out)); + if (!isl_union_set_is_empty(ps->call)) { + live = isl_union_set_union(live, isl_union_set_copy(ps->call)); + live = isl_union_set_coalesce(live); + } + + dep = isl_union_map_copy(ps->dep_flow); + dep = isl_union_map_reverse(dep); + + for (;;) { + isl_union_set *extra; + + extra = isl_union_set_apply(isl_union_set_copy(live), + isl_union_map_copy(dep)); + if (isl_union_set_is_subset(extra, live)) { + isl_union_set_free(extra); + break; + } + + live = isl_union_set_union(live, extra); + live = isl_union_set_affine_hull(live); + live = isl_union_set_intersect(live, + isl_union_set_copy(ps->domain)); + } + + isl_union_map_free(dep); + + ps->domain = isl_union_set_intersect(ps->domain, + isl_union_set_copy(live)); + ps->schedule = isl_schedule_intersect_domain(ps->schedule, + isl_union_set_copy(live)); + ps->dep_flow = isl_union_map_intersect_range(ps->dep_flow, + isl_union_set_copy(live)); + tagger = isl_union_pw_multi_aff_copy(ps->tagger); + live = isl_union_set_preimage_union_pw_multi_aff(live, tagger); + ps->tagged_dep_flow = isl_union_map_intersect_range(ps->tagged_dep_flow, + live); +} + +/* Intersect "set" with the set described by "str", taking the NULL + * string to represent the universal set. 
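+ *
+ * For example, the --ctx string "[N] -> { : N >= 16 }" restricts the
+ * parameter N to values of at least 16.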
+ */ +static __isl_give isl_set *set_intersect_str(__isl_take isl_set *set, + const char *str) +{ + isl_ctx *ctx; + isl_set *set2; + + if (!str) + return set; + + ctx = isl_set_get_ctx(set); + set2 = isl_set_read_from_str(ctx, str); + set = isl_set_intersect(set, set2); + + return set; +} + +void *ppcg_scop_free(struct ppcg_scop *ps) +{ + if (!ps) + return NULL; + + isl_set_free(ps->context); + isl_union_set_free(ps->domain); + isl_union_set_free(ps->call); + isl_union_map_free(ps->tagged_reads); + isl_union_map_free(ps->reads); + isl_union_map_free(ps->live_in); + isl_union_map_free(ps->tagged_may_writes); + isl_union_map_free(ps->tagged_must_writes); + isl_union_map_free(ps->may_writes); + isl_union_map_free(ps->must_writes); + isl_union_map_free(ps->live_out); + isl_union_map_free(ps->tagged_must_kills); + isl_union_map_free(ps->must_kills); + isl_union_map_free(ps->tagged_dep_flow); + isl_union_map_free(ps->dep_flow); + isl_union_map_free(ps->dep_false); + isl_union_map_free(ps->dep_forced); + isl_union_map_free(ps->tagged_dep_order); + isl_union_map_free(ps->dep_order); + isl_schedule_free(ps->schedule); + isl_union_pw_multi_aff_free(ps->tagger); + isl_union_map_free(ps->independence); + isl_id_to_ast_expr_free(ps->names); + + free(ps); + + return NULL; +} + +/* Extract a ppcg_scop from a pet_scop. + * + * The constructed ppcg_scop refers to elements from the pet_scop + * so the pet_scop should not be freed before the ppcg_scop. + */ +static struct ppcg_scop *ppcg_scop_from_pet_scop(struct pet_scop *scop, + struct ppcg_options *options) +{ + int i; + isl_ctx *ctx; + struct ppcg_scop *ps; + + if (!scop) + return NULL; + + ctx = isl_set_get_ctx(scop->context); + + ps = isl_calloc_type(ctx, struct ppcg_scop); + if (!ps) + return NULL; + + ps->names = collect_names(scop); + ps->options = options; + ps->start = pet_loc_get_start(scop->loc); + ps->end = pet_loc_get_end(scop->loc); + ps->context = isl_set_copy(scop->context); + ps->context = set_intersect_str(ps->context, options->ctx); + if (options->non_negative_parameters) { + isl_space *space = isl_set_get_space(ps->context); + isl_set *nn = isl_set_nat_universe(space); + ps->context = isl_set_intersect(ps->context, nn); + } + ps->domain = collect_non_kill_domains(scop); + ps->call = collect_call_domains(scop); + ps->tagged_reads = pet_scop_get_tagged_may_reads(scop); + ps->reads = pet_scop_get_may_reads(scop); + ps->tagged_may_writes = pet_scop_get_tagged_may_writes(scop); + ps->may_writes = pet_scop_get_may_writes(scop); + ps->tagged_must_writes = pet_scop_get_tagged_must_writes(scop); + ps->must_writes = pet_scop_get_must_writes(scop); + ps->tagged_must_kills = pet_scop_get_tagged_must_kills(scop); + ps->must_kills = pet_scop_get_must_kills(scop); + ps->schedule = isl_schedule_copy(scop->schedule); + ps->pet = scop; + ps->independence = isl_union_map_empty(isl_set_get_space(ps->context)); + for (i = 0; i < scop->n_independence; ++i) + ps->independence = isl_union_map_union(ps->independence, + isl_union_map_copy(scop->independences[i]->filter)); + + compute_tagger(ps); + compute_dependences(ps); + eliminate_dead_code(ps); + + if (!ps->context || !ps->domain || !ps->call || !ps->reads || + !ps->may_writes || !ps->must_writes || !ps->tagged_must_kills || + !ps->must_kills || !ps->schedule || !ps->independence || !ps->names) + return ppcg_scop_free(ps); + + return ps; +} + +/* Internal data structure for ppcg_transform. 
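+ *
+ * "options" are the options passed to ppcg_transform.
+ * "transform" is the callback that should be applied to each scop and
+ * "user" is the opaque pointer that is passed to this callback.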
+ */ +struct ppcg_transform_data { + struct ppcg_options *options; + __isl_give isl_printer *(*transform)(__isl_take isl_printer *p, + struct ppcg_scop *scop, void *user); + void *user; +}; + +/* Should we print the original code? + * That is, does "scop" involve any data dependent conditions or + * nested expressions that cannot be handled by pet_stmt_build_ast_exprs? + */ +static int print_original(struct pet_scop *scop, struct ppcg_options *options) +{ + if (!pet_scop_can_build_ast_exprs(scop)) { + if (options->debug->verbose) + fprintf(stdout, "Printing original code because " + "some index expressions cannot currently " + "be printed\n"); + return 1; + } + + if (pet_scop_has_data_dependent_conditions(scop)) { + if (options->debug->verbose) + fprintf(stdout, "Printing original code because " + "input involves data dependent conditions\n"); + return 1; + } + + return 0; +} + +/* Callback for pet_transform_C_source that transforms + * the given pet_scop to a ppcg_scop before calling the + * ppcg_transform callback. + * + * If "scop" contains any data dependent conditions or if we may + * not be able to print the transformed program, then just print + * the original code. + */ +static __isl_give isl_printer *transform(__isl_take isl_printer *p, + struct pet_scop *scop, void *user) +{ + struct ppcg_transform_data *data = user; + struct ppcg_scop *ps; + + if (print_original(scop, data->options)) { + p = pet_scop_print_original(scop, p); + pet_scop_free(scop); + return p; + } + + scop = pet_scop_align_params(scop); + ps = ppcg_scop_from_pet_scop(scop, data->options); + + p = data->transform(p, ps, data->user); + + ppcg_scop_free(ps); + pet_scop_free(scop); + + return p; +} + +/* Transform the C source file "input" by rewriting each scop + * through a call to "transform". + * The transformed C code is written to "out". + * + * This is a wrapper around pet_transform_C_source that transforms + * the pet_scop to a ppcg_scop before calling "fn". + */ +int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out, + struct ppcg_options *options, + __isl_give isl_printer *(*fn)(__isl_take isl_printer *p, + struct ppcg_scop *scop, void *user), void *user) +{ + struct ppcg_transform_data data = { options, fn, user }; + return pet_transform_C_source(ctx, input, out, &transform, &data); +} + +/* Check consistency of options. + * + * Return -1 on error. 
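+ *
+ * In particular, OpenMP code can only be generated if the isl AST
+ * generator is set to produce atomic upper bounds.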
+ */ +static int check_options(isl_ctx *ctx) +{ + struct options *options; + + options = isl_ctx_peek_options(ctx, &options_args); + if (!options) + isl_die(ctx, isl_error_internal, + "unable to find options", return -1); + + if (options->ppcg->openmp && + !isl_options_get_ast_build_atomic_upper_bound(ctx)) + isl_die(ctx, isl_error_invalid, + "OpenMP requires atomic bounds", return -1); + + return 0; +} + +#if 0 +int main(int argc, char **argv) +{ + int r; + isl_ctx *ctx; + struct options *options; + + options = options_new_with_defaults(); + assert(options); + + ctx = isl_ctx_alloc_with_options(&options_args, options); + ppcg_options_set_target_defaults(options->ppcg); + isl_options_set_ast_build_detect_min_max(ctx, 1); + isl_options_set_ast_print_macro_once(ctx, 1); + isl_options_set_schedule_whole_component(ctx, 0); + isl_options_set_schedule_maximize_band_depth(ctx, 1); + isl_options_set_schedule_maximize_coincidence(ctx, 1); + pet_options_set_encapsulate_dynamic_control(ctx, 1); + argc = options_parse(options, argc, argv, ISL_ARG_ALL); + + if (check_options(ctx) < 0) + r = EXIT_FAILURE; + else if (options->ppcg->target == PPCG_TARGET_CUDA) + r = generate_cuda(ctx, options->ppcg, options->input); + else if (options->ppcg->target == PPCG_TARGET_OPENCL) + r = generate_opencl(ctx, options->ppcg, options->input, + options->output); + else + r = generate_cpu(ctx, options->ppcg, options->input, + options->output); + + isl_ctx_free(ctx); + + return r; +} +#endif diff --git a/polly/lib/External/ppcg/ppcg_options.h b/polly/lib/External/ppcg/ppcg_options.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/ppcg_options.h @@ -0,0 +1,100 @@ +#ifndef PPCG_OPTIONS_H +#define PPCG_OPTIONS_H + +#include +#include + +struct ppcg_debug_options { + int dump_schedule_constraints; + int dump_schedule; + int dump_final_schedule; + int dump_sizes; + int verbose; +}; + +struct ppcg_options { + struct isl_options *isl; + struct ppcg_debug_options *debug; + + /* Group chains of consecutive statements before scheduling. */ + int group_chains; + + /* Use isl to compute a schedule replacing the original schedule. */ + int reschedule; + int scale_tile_loops; + int wrap; + + /* Assume all parameters are non-negative. */ + int non_negative_parameters; + char *ctx; + char *sizes; + + /* Perform tiling (C target). */ + int tile; + int tile_size; + + /* Isolate full tiles from partial tiles. */ + int isolate_full_tiles; + + /* Take advantage of private memory. */ + int use_private_memory; + + /* Take advantage of shared memory. */ + int use_shared_memory; + + /* Maximal amount of shared memory. */ + int max_shared_memory; + + /* The target we generate code for. */ + int target; + + /* Generate OpenMP macros (C target only). */ + int openmp; + + /* Linearize all device arrays. */ + int linearize_device_arrays; + + /* Allow the use of GNU extensions in generated code. */ + int allow_gnu_extensions; + + /* Allow live range to be reordered. */ + int live_range_reordering; + + /* Allow hybrid tiling whenever a suitable input pattern is found. */ + int hybrid; + + /* Unroll the code for copying to/from shared memory. */ + int unroll_copy_shared; + /* Unroll code inside tile on GPU targets. */ + int unroll_gpu_tile; + + /* Options to pass to the OpenCL compiler. */ + char *opencl_compiler_options; + /* Prefer GPU device over CPU. */ + int opencl_use_gpu; + /* Number of files to include. */ + int opencl_n_include_file; + /* Files to include. 
*/ + const char **opencl_include_files; + /* Print definitions of types in kernels. */ + int opencl_print_kernel_types; + /* Embed OpenCL kernel code in host code. */ + int opencl_embed_kernel_code; + + /* Name of file for saving isl computed schedule or NULL. */ + char *save_schedule_file; + /* Name of file for loading schedule or NULL. */ + char *load_schedule_file; +}; + +ISL_ARG_DECL(ppcg_debug_options, struct ppcg_debug_options, + ppcg_debug_options_args) +ISL_ARG_DECL(ppcg_options, struct ppcg_options, ppcg_options_args) + +#define PPCG_TARGET_C 0 +#define PPCG_TARGET_CUDA 1 +#define PPCG_TARGET_OPENCL 2 + +void ppcg_options_set_target_defaults(struct ppcg_options *options); + +#endif diff --git a/polly/lib/External/ppcg/ppcg_options.c b/polly/lib/External/ppcg/ppcg_options.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/ppcg_options.c @@ -0,0 +1,136 @@ +/* + * Copyright 2010-2011 INRIA Saclay + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, + * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, + * 91893 Orsay, France + */ + +#include "ppcg_options.h" + +static struct isl_arg_choice target[] = { + {"c", PPCG_TARGET_C}, + {"cuda", PPCG_TARGET_CUDA}, + {"opencl", PPCG_TARGET_OPENCL}, + {0} +}; + +/* Set defaults that depend on the target. + * In particular, set --schedule-outer-coincidence iff target is a GPU. + */ +void ppcg_options_set_target_defaults(struct ppcg_options *options) +{ + char *argv[2] = { NULL }; + + argv[0] = "ppcg_options_set_target_defaults"; + if (options->target == PPCG_TARGET_C) + argv[1] = "--no-schedule-outer-coincidence"; + else + argv[1] = "--schedule-outer-coincidence"; + + isl_options_parse(options->isl, 2, argv, ISL_ARG_ALL); +} + +/* Callback that is called whenever the "target" option is set (to "val"). + * The callback is called after target has been updated. + * + * Call ppcg_options_set_target_defaults to reset the target-dependent options. 
+ */ +static int set_target(void *opt, unsigned val) +{ + struct ppcg_options *options = opt; + + ppcg_options_set_target_defaults(options); + + return 0; +} + +ISL_ARGS_START(struct ppcg_debug_options, ppcg_debug_options_args) +ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule_constraints, 0, + "dump-schedule-constraints", 0, "dump schedule constraints") +ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule, 0, + "dump-schedule", 0, "dump isl computed schedule") +ISL_ARG_BOOL(struct ppcg_debug_options, dump_final_schedule, 0, + "dump-final-schedule", 0, "dump PPCG computed schedule") +ISL_ARG_BOOL(struct ppcg_debug_options, dump_sizes, 0, + "dump-sizes", 0, + "dump effectively used per kernel tile, grid and block sizes") +ISL_ARG_BOOL(struct ppcg_debug_options, verbose, 'v', "verbose", 0, NULL) +ISL_ARGS_END + +ISL_ARGS_START(struct ppcg_options, ppcg_opencl_options_args) +ISL_ARG_STR(struct ppcg_options, opencl_compiler_options, 0, "compiler-options", + "options", NULL, "options to pass to the OpenCL compiler") +ISL_ARG_BOOL(struct ppcg_options, opencl_use_gpu, 0, "use-gpu", 1, + "use GPU device (if available)") +ISL_ARG_STR_LIST(struct ppcg_options, opencl_n_include_file, + opencl_include_files, 0, "include-file", "filename", + "file to #include in generated OpenCL code") +ISL_ARG_BOOL(struct ppcg_options, opencl_print_kernel_types, 0, + "print-kernel-types", 1, + "print definitions of types in the kernel file") +ISL_ARG_BOOL(struct ppcg_options, opencl_embed_kernel_code, 0, + "embed-kernel-code", 0, "embed kernel code into host code") +ISL_ARGS_END + +ISL_ARGS_START(struct ppcg_options, ppcg_options_args) +ISL_ARG_CHILD(struct ppcg_options, isl, "isl", &isl_options_args, "isl options") +ISL_ARG_CHILD(struct ppcg_options, debug, NULL, &ppcg_debug_options_args, + "debugging options") +ISL_ARG_BOOL(struct ppcg_options, group_chains, 0, "group-chains", 1, + "group chains of interdependent statements that are executed " + "consecutively in the original schedule before scheduling") +ISL_ARG_BOOL(struct ppcg_options, reschedule, 0, "reschedule", 1, + "replace original schedule by isl computed schedule") +ISL_ARG_BOOL(struct ppcg_options, scale_tile_loops, 0, + "scale-tile-loops", 1, NULL) +ISL_ARG_BOOL(struct ppcg_options, wrap, 0, "wrap", 1, NULL) +ISL_ARG_BOOL(struct ppcg_options, use_shared_memory, 0, "shared-memory", 1, + "use shared memory in kernel code") +ISL_ARG_BOOL(struct ppcg_options, use_private_memory, 0, "private-memory", 1, + "use private memory in kernel code") +ISL_ARG_STR(struct ppcg_options, ctx, 0, "ctx", "context", NULL, + "Constraints on parameters") +ISL_ARG_BOOL(struct ppcg_options, non_negative_parameters, 0, + "assume-non-negative-parameters", 0, + "assume all parameters are non-negative)") +ISL_ARG_BOOL(struct ppcg_options, tile, 0, "tile", 0, + "perform tiling (C target)") +ISL_ARG_INT(struct ppcg_options, tile_size, 'S', "tile-size", "size", 32, NULL) +ISL_ARG_BOOL(struct ppcg_options, isolate_full_tiles, 0, "isolate-full-tiles", + 0, "isolate full tiles from partial tiles (hybrid tiling)") +ISL_ARG_STR(struct ppcg_options, sizes, 0, "sizes", "sizes", NULL, + "Per kernel tile, grid and block sizes") +ISL_ARG_INT(struct ppcg_options, max_shared_memory, 0, + "max-shared-memory", "size", 8192, "maximal amount of shared memory") +ISL_ARG_BOOL(struct ppcg_options, openmp, 0, "openmp", 0, + "Generate OpenMP macros (only for C target)") +ISL_ARG_USER_OPT_CHOICE(struct ppcg_options, target, 0, "target", target, + &set_target, PPCG_TARGET_CUDA, PPCG_TARGET_CUDA, + 
"the target to generate code for") +ISL_ARG_BOOL(struct ppcg_options, linearize_device_arrays, 0, + "linearize-device-arrays", 1, + "linearize all device arrays, even those of fixed size") +ISL_ARG_BOOL(struct ppcg_options, allow_gnu_extensions, 0, + "allow-gnu-extensions", 1, + "allow the use of GNU extensions in generated code") +ISL_ARG_BOOL(struct ppcg_options, live_range_reordering, 0, + "live-range-reordering", 1, + "allow successive live ranges on the same memory element " + "to be reordered") +ISL_ARG_BOOL(struct ppcg_options, hybrid, 0, "hybrid", 0, + "apply hybrid tiling whenever a suitable input pattern is found " + "(GPU targets)") +ISL_ARG_BOOL(struct ppcg_options, unroll_copy_shared, 0, "unroll-copy-shared", + 0, "unroll code for copying to/from shared memory") +ISL_ARG_BOOL(struct ppcg_options, unroll_gpu_tile, 0, "unroll-gpu-tile", 0, + "unroll code inside tile on GPU targets") +ISL_ARG_GROUP("opencl", &ppcg_opencl_options_args, "OpenCL options") +ISL_ARG_STR(struct ppcg_options, save_schedule_file, 0, "save-schedule", + "file", NULL, "save isl computed schedule to ") +ISL_ARG_STR(struct ppcg_options, load_schedule_file, 0, "load-schedule", + "file", NULL, "load schedule from , " + "using it instead of an isl computed schedule") +ISL_ARGS_END diff --git a/polly/lib/External/ppcg/print.h b/polly/lib/External/ppcg/print.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/print.h @@ -0,0 +1,40 @@ +#ifndef PRINT_H +#define PRINT_H + +#include + +#include "ppcg.h" + +extern const char *ppcg_min; +extern const char *ppcg_max; +extern const char *ppcg_fdiv_q; + +__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p); +__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p); + +__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p); +__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p, + const char *min, const char *max); +__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type, + __isl_take isl_printer *p); +__isl_give isl_printer *ppcg_ast_expr_print_macros( + __isl_keep isl_ast_expr *expr, __isl_take isl_printer *p); +__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p, + __isl_keep isl_id_to_ast_expr *ref2expr); +__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node); + +__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size, + __isl_keep isl_ast_build *build); + +__isl_give isl_printer *ppcg_print_declaration_with_size( + __isl_take isl_printer *p, const char *base_type, + __isl_keep isl_ast_expr *size); +__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p, + struct pet_array *array, __isl_keep isl_ast_build *build); +__isl_give isl_printer *ppcg_print_exposed_declarations( + __isl_take isl_printer *p, struct ppcg_scop *scop); +__isl_give isl_printer *ppcg_print_hidden_declarations( + __isl_take isl_printer *p, struct ppcg_scop *scop); + +#endif diff --git a/polly/lib/External/ppcg/print.c b/polly/lib/External/ppcg/print.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/print.c @@ -0,0 +1,461 @@ +/* + * Copyright 2012-2013 Ecole Normale Superieure + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France + */ + +#include +#include +#include + +#include "print.h" +#include "util.h" + +__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p) +{ + p = 
isl_printer_start_line(p); + p = isl_printer_print_str(p, "{"); + p = isl_printer_end_line(p); + p = isl_printer_indent(p, 2); + return p; +} + +__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p) +{ + p = isl_printer_indent(p, -2); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "}"); + p = isl_printer_end_line(p); + return p; +} + +/* Names of notes that keep track of whether min/max + * macro definitions have already been printed. + */ +static const char *ppcg_max_printed = "ppcg_max_printed"; +static const char *ppcg_min_printed = "ppcg_min_printed"; + +/* Has the macro definition corresponding to "note_name" been printed + * to "p" before? + * That is, does "p" have an associated "note_name" note? + */ +static isl_bool printed_before(__isl_keep isl_printer *p, const char *note_name) +{ + isl_ctx *ctx; + isl_id *id; + isl_bool printed; + + if (!p) + return isl_bool_error; + + ctx = isl_printer_get_ctx(p); + id = isl_id_alloc(ctx, note_name, NULL); + printed = isl_printer_has_note(p, id); + isl_id_free(id); + + return printed; +} + +/* Keep track of the fact that the macro definition corresponding + * to "note_name" has been printed to "p" by attaching a note with + * that name. The value of the note is of no importance, but it + * has to be a valid isl_id, so the note identifier is reused + * as the note. + */ +static __isl_give isl_printer *mark_printed(__isl_take isl_printer *p, + const char *note_name) +{ + isl_ctx *ctx; + isl_id *id; + + if (!p) + return NULL; + + ctx = isl_printer_get_ctx(p); + id = isl_id_alloc(ctx, note_name, NULL); + return isl_printer_set_note(p, id, isl_id_copy(id)); +} + +/* Print a macro definition "def" for the macro "name" to "p", + * unless such a macro definition has been printed to "p" before. + * "note_name" is used as the name of the note that keeps track + * of whether this printing has happened. + */ +static __isl_give isl_printer *print_ppcg_macro(__isl_take isl_printer *p, + const char *name, const char *def, const char *note_name) +{ + isl_bool printed; + + printed = printed_before(p, note_name); + if (printed < 0) + return isl_printer_free(p); + if (printed) + return p; + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "#define "); + p = isl_printer_print_str(p, name); + p = isl_printer_print_str(p, def); + p = isl_printer_end_line(p); + + p = mark_printed(p, note_name); + + return p; +} + +/* Structure for keeping track of definitions of some macros. + */ +struct ppcg_macros { + const char *min; + const char *max; +}; + +/* Free the memory allocated by a struct ppcg_macros. + */ +static void ppcg_macros_free(void *user) +{ + free(user); +} + +/* Default macro definitions (when GNU extensions are allowed). + */ +struct ppcg_macros ppcg_macros_default = { + .min = "(x,y) " + "({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); " + "_x < _y ? _x : _y; })", + .max = "(x,y) " + "({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); " + "_x > _y ? _x : _y; })", +}; + +/* Name used for the note that keeps track of macro definitions. + */ +static const char *ppcg_macros = "ppcg_macros"; + +/* Set the macro definitions for isl_ast_op_min and isl_ast_op_max + * to "min" and "max" and store them in "p". + * + * In particular, create a ppcg_macros object and attach it + * as a note to the printer. 
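+ *
+ * Note that only pointers to "min" and "max" are stored, so the strings
+ * need to remain valid for as long as they may be used by the printer.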
+ */ +__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p, + const char *min, const char *max) +{ + isl_ctx *ctx; + isl_id *id, *macros_id; + struct ppcg_macros *macros; + + if (!p) + return NULL; + + ctx = isl_printer_get_ctx(p); + macros = isl_alloc_type(ctx, struct ppcg_macros); + if (!macros) + return isl_printer_free(p); + macros->min = min; + macros->max = max; + id = isl_id_alloc(ctx, ppcg_macros, NULL); + macros_id = isl_id_alloc(ctx, NULL, macros); + if (!macros_id) + ppcg_macros_free(macros); + else + macros_id = isl_id_set_free_user(macros_id, &ppcg_macros_free); + + p = isl_printer_set_note(p, id, macros_id); + + return p; +} + +/* Return the ppcg_macros object that holds the currently active + * macro definitions in "p". + * If "p" has a note with macro definitions, then return those. + * Otherwise, return the default macro definitions. + */ +static struct ppcg_macros *get_macros(__isl_keep isl_printer *p) +{ + isl_id *id; + isl_bool has_macros; + struct ppcg_macros *macros; + + id = isl_id_alloc(isl_printer_get_ctx(p), ppcg_macros, NULL); + has_macros = isl_printer_has_note(p, id); + if (has_macros < 0 || !has_macros) { + isl_id_free(id); + if (has_macros < 0) + return NULL; + return &ppcg_macros_default; + } + id = isl_printer_get_note(p, id); + macros = isl_id_get_user(id); + isl_id_free(id); + + return macros; +} + +/* Print the currently active macro definition for ppcg_max. + */ +static __isl_give isl_printer *print_max(__isl_take isl_printer *p) +{ + struct ppcg_macros *macros; + + macros = get_macros(p); + if (!macros) + return isl_printer_free(p); + return print_ppcg_macro(p, ppcg_max, macros->max, ppcg_max_printed); +} + +/* Print the currently active macro definition for ppcg_min. + */ +static __isl_give isl_printer *print_min(__isl_take isl_printer *p) +{ + struct ppcg_macros *macros; + + macros = get_macros(p); + if (!macros) + return isl_printer_free(p); + return print_ppcg_macro(p, ppcg_min, macros->min, ppcg_min_printed); +} + +/* Print a macro definition for "type" to "p". + * If GNU extensions are allowed, then print a specialized definition + * for isl_ast_op_min and isl_ast_op_max. + * Otherwise, use the default isl definition. + */ +__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type, + __isl_take isl_printer *p) +{ + isl_ctx *ctx; + struct ppcg_options *options; + + if (!p) + return NULL; + + ctx = isl_printer_get_ctx(p); + options = isl_ctx_peek_options(ctx, &ppcg_options_args); + if (!options || !options->allow_gnu_extensions) + return isl_ast_op_type_print_macro(type, p); + + switch (type) { + case isl_ast_op_max: + return print_max(p); + case isl_ast_op_min: + return print_min(p); + default: + return isl_ast_op_type_print_macro(type, p); + } +} + +/* isl_ast_expr_foreach_ast_op_type or isl_ast_node_foreach_ast_op_type + * callback that prints a macro definition for "type". + */ +static isl_stat print_macro(enum isl_ast_op_type type, void *user) +{ + isl_printer **p = user; + + *p = ppcg_print_macro(type, *p); + if (!*p) + return isl_stat_error; + + return isl_stat_ok; +} + +/* Print the required macros for "expr". + */ +__isl_give isl_printer *ppcg_ast_expr_print_macros( + __isl_keep isl_ast_expr *expr, __isl_take isl_printer *p) +{ + if (isl_ast_expr_foreach_ast_op_type(expr, &print_macro, &p) < 0) + return isl_printer_free(p); + return p; +} + +/* isl_id_to_ast_expr_foreach callback that prints the required + * macro definitions for "val". 
+ */ +static isl_stat print_expr_macros(__isl_take isl_id *key, + __isl_take isl_ast_expr *val, void *user) +{ + isl_printer **p = user; + + *p = ppcg_ast_expr_print_macros(val, *p); + isl_id_free(key); + isl_ast_expr_free(val); + + if (!*p) + return isl_stat_error; + return isl_stat_ok; +} + +/* Print the required macro definitions for the body of a statement in which + * the access expressions are replaced by the isl_ast_expr objects + * in "ref2expr". + */ +__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p, + __isl_keep isl_id_to_ast_expr *ref2expr) +{ + if (isl_id_to_ast_expr_foreach(ref2expr, &print_expr_macros, &p) < 0) + return isl_printer_free(p); + return p; +} + +/* Print the required macros for "node". + */ +__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node) +{ + if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0) + return isl_printer_free(p); + return p; +} + +/* Names used for the macros that may appear in a printed isl AST. + */ +const char *ppcg_min = "ppcg_min"; +const char *ppcg_max = "ppcg_max"; +const char *ppcg_fdiv_q = "ppcg_fdiv_q"; + +/* Set the names of the macros that may appear in a printed isl AST. + */ +__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p) +{ + p = isl_ast_op_type_set_print_name(p, isl_ast_op_min, ppcg_min); + p = isl_ast_op_type_set_print_name(p, isl_ast_op_max, ppcg_max); + p = isl_ast_op_type_set_print_name(p, isl_ast_op_fdiv_q, ppcg_fdiv_q); + + return p; +} + +/* Given a multi affine expression "mpa" without domain, modify it to have + * the schedule space of "build" as domain. + * + * If the schedule space of "build" is a parameter space, then nothing + * needs to be done. + * Otherwise, "mpa" is first given a 0D domain and then it is combined + * with a mapping from the schedule space of "build" to the same 0D domain. + */ +__isl_give isl_multi_pw_aff *ppcg_attach_multi_pw_aff( + __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_ast_build *build) +{ + isl_bool params; + isl_space *space; + isl_multi_aff *ma; + + space = isl_ast_build_get_schedule_space(build); + params = isl_space_is_params(space); + if (params < 0 || params) { + isl_space_free(space); + if (params < 0) + return isl_multi_pw_aff_free(mpa); + return mpa; + } + space = isl_space_from_domain(space); + ma = isl_multi_aff_zero(space); + mpa = isl_multi_pw_aff_from_range(mpa); + mpa = isl_multi_pw_aff_pullback_multi_aff(mpa, ma); + + return mpa; +} + +/* Build an access AST expression from "size" using "build". + * "size" does not have a domain, but "build" may have a proper schedule space. + * First modify "size" to have that schedule space as domain. + */ +__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size, + __isl_keep isl_ast_build *build) +{ + size = ppcg_attach_multi_pw_aff(size, build); + return isl_ast_build_access_from_multi_pw_aff(build, size); +} + +/* Print a declaration for an array with element type "base_type" and + * size "size" to "p". 
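+ *
+ * For example, a base type "float" and a size expression "A[N][32]"
+ * result in the line "float A[N][32];".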
+ */ +__isl_give isl_printer *ppcg_print_declaration_with_size( + __isl_take isl_printer *p, const char *base_type, + __isl_keep isl_ast_expr *size) +{ + if (!base_type || !size) + return isl_printer_free(p); + + p = ppcg_ast_expr_print_macros(size, p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, base_type); + p = isl_printer_print_str(p, " "); + p = isl_printer_print_ast_expr(p, size); + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + return p; +} + +/* Print a declaration for array "array" to "p", using "build" + * to simplify any size expressions. + * + * The size is computed from the extent of the array and is + * subsequently converted to an "access expression" by "build". + */ +__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p, + struct pet_array *array, __isl_keep isl_ast_build *build) +{ + isl_multi_pw_aff *size; + isl_ast_expr *expr; + + if (!array) + return isl_printer_free(p); + + size = ppcg_size_from_extent(isl_set_copy(array->extent)); + expr = isl_ast_build_access_from_multi_pw_aff(build, size); + p = ppcg_print_declaration_with_size(p, array->element_type, expr); + isl_ast_expr_free(expr); + + return p; +} + +/* Print declarations for the arrays in "scop" that are declared + * and that are exposed (if exposed == 1) or not exposed (if exposed == 0). + */ +static __isl_give isl_printer *print_declarations(__isl_take isl_printer *p, + struct ppcg_scop *scop, int exposed) +{ + int i; + isl_ast_build *build; + + if (!scop) + return isl_printer_free(p); + + build = isl_ast_build_from_context(isl_set_copy(scop->context)); + for (i = 0; i < scop->pet->n_array; ++i) { + struct pet_array *array = scop->pet->arrays[i]; + + if (!array->declared) + continue; + if (array->exposed != exposed) + continue; + + p = ppcg_print_declaration(p, array, build); + } + isl_ast_build_free(build); + + return p; +} + +/* Print declarations for the arrays in "scop" that are declared + * and exposed to the code after the scop. + */ +__isl_give isl_printer *ppcg_print_exposed_declarations( + __isl_take isl_printer *p, struct ppcg_scop *scop) +{ + return print_declarations(p, scop, 1); +} + +/* Print declarations for the arrays in "scop" that are declared, + * but not exposed to the code after the scop. 
+ */ +__isl_give isl_printer *ppcg_print_hidden_declarations( + __isl_take isl_printer *p, struct ppcg_scop *scop) +{ + return print_declarations(p, scop, 0); +} diff --git a/polly/lib/External/ppcg/schedule.h b/polly/lib/External/ppcg/schedule.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/schedule.h @@ -0,0 +1,21 @@ +#ifndef _SCHEDULE_H +#define _SCHEDULE_H + +#include +#include +#include +#include + +#include "ppcg_options.h" + +__isl_give isl_set *parametrization(__isl_take isl_space *space, + int len, int first, __isl_keep isl_id_list *names); + +__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx, + struct ppcg_options *options, + __isl_give isl_schedule *(*compute)(void *user), void *user); + +__isl_give isl_schedule_node *ppcg_set_schedule_node_type( + __isl_take isl_schedule_node *node, enum isl_ast_loop_type type); + +#endif diff --git a/polly/lib/External/ppcg/schedule.c b/polly/lib/External/ppcg/schedule.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/schedule.c @@ -0,0 +1,165 @@ +/* + * Copyright 2010-2011 INRIA Saclay + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, + * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, + * 91893 Orsay, France + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "schedule.h" + +/* Add parameters with identifiers "ids" to "set". + */ +static __isl_give isl_set *add_params(__isl_take isl_set *set, + __isl_keep isl_id_list *ids) +{ + int i, n; + unsigned nparam; + + n = isl_id_list_n_id(ids); + + nparam = isl_set_dim(set, isl_dim_param); + set = isl_set_add_dims(set, isl_dim_param, n); + + for (i = 0; i < n; ++i) { + isl_id *id; + + id = isl_id_list_get_id(ids, i); + set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id); + } + + return set; +} + +/* Equate the dimensions of "set" starting at "first" to + * freshly created parameters with identifiers "ids". + * The number of equated dimensions is equal to the number of elements in "ids". + */ +static __isl_give isl_set *parametrize(__isl_take isl_set *set, + int first, __isl_keep isl_id_list *ids) +{ + int i, n; + unsigned nparam; + + nparam = isl_set_dim(set, isl_dim_param); + + set = add_params(set, ids); + + n = isl_id_list_n_id(ids); + for (i = 0; i < n; ++i) + set = isl_set_equate(set, isl_dim_param, nparam + i, + isl_dim_set, first + i); + + return set; +} + +/* Given a parameter space "space", create a set of dimension "len" + * of which the dimensions starting at "first" are equated to + * freshly created parameters with identifiers "ids". + */ +__isl_give isl_set *parametrization(__isl_take isl_space *space, + int len, int first, __isl_keep isl_id_list *ids) +{ + isl_set *set; + + space = isl_space_set_from_params(space); + space = isl_space_add_dims(space, isl_dim_set, len); + set = isl_set_universe(space); + + return parametrize(set, first, ids); +} + +/* Load and return a schedule from a file called "filename". + */ +static __isl_give isl_schedule *load_schedule(isl_ctx *ctx, + const char *filename) +{ + FILE *file; + isl_schedule *schedule; + + file = fopen(filename, "r"); + if (!file) { + fprintf(stderr, "Unable to open '%s' for reading\n", filename); + return NULL; + } + schedule = isl_schedule_read_from_file(ctx, file); + fclose(file); + + return schedule; +} + +/* Save the schedule "schedule" to a file called "filename". + * The schedule is printed in block style. 
+ */ +static void save_schedule(__isl_keep isl_schedule *schedule, + const char *filename) +{ + FILE *file; + isl_ctx *ctx; + isl_printer *p; + + if (!schedule) + return; + + file = fopen(filename, "w"); + if (!file) { + fprintf(stderr, "Unable to open '%s' for writing\n", filename); + return; + } + ctx = isl_schedule_get_ctx(schedule); + p = isl_printer_to_file(ctx, file); + p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); + p = isl_printer_print_schedule(p, schedule); + isl_printer_free(p); + fclose(file); +} + +/* Obtain a schedule, either by reading it form a file + * or by computing it using "compute". + * Also take care of saving the computed schedule and/or + * dumping the obtained schedule if requested by the user. + */ +__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx, + struct ppcg_options *options, + __isl_give isl_schedule *(*compute)(void *user), void *user) +{ + isl_schedule *schedule; + + if (options->load_schedule_file) { + schedule = load_schedule(ctx, options->load_schedule_file); + } else { + schedule = compute(user); + if (options->save_schedule_file) + save_schedule(schedule, options->save_schedule_file); + } + if (options->debug->dump_schedule) + isl_schedule_dump(schedule); + + return schedule; +} + +/* Mark all dimensions in the band node "node" to be of "type". + */ +__isl_give isl_schedule_node *ppcg_set_schedule_node_type( + __isl_take isl_schedule_node *node, enum isl_ast_loop_type type) +{ + int i, n; + + n = isl_schedule_node_band_n_member(node); + for (i = 0; i < n; ++i) + node = isl_schedule_node_band_member_set_ast_loop_type(node, i, + type); + + return node; +} diff --git a/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c b/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c @@ -0,0 +1,49 @@ +#include + +int main() +{ + int A[2][1000][1000]; + int B[2][1000][1000]; + +#pragma scop + { + for (int i = 0; i < 256; ++i) + for (int j = 0; j < 256; ++j) + if (j % 8 <= 2 || j % 8 >= 6) + A[1][i][j] = B[1][j][i]; + } +#pragma endscop + +/* + +When compiled with: + +./ppcg tests/allow-sparse-copy-in.c --no-linearize-device-arrays + --on-error=abort --sizes='{kernel[i]->tile[8,8]; kernel[i]->block[1,8]}' + --max-shared-memory=-1 --unroll-copy-shared + +this originally resulted in the following copy-in code: + + shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1]; + shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1]; + shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1]; + shared_B[0][3][t1] = B[1][8 * b1 + 3][8 * b0 + t1]; + shared_B[0][4][t1] = B[1][8 * b1 + 4][8 * b0 + t1]; + shared_B[0][5][t1] = B[1][8 * b1 + 5][8 * b0 + t1]; + shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1]; + shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1]; + +whereas we only want to only perform copies that are actually needed: + + shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1]; + shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1]; + shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1]; + shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1]; + shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1]; +*/ + for (int i = 0; i < 100; ++i) + if (A[1][0][i] != i) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/call.c b/polly/lib/External/ppcg/tests/call.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/call.c @@ -0,0 +1,29 @@ +#include + +void copy_summary(int b[1000], int a[1000], int pos) +{ + b[pos] = 0; + int c = a[pos]; +} 
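+
+/* copy() is only declared further down, never defined in this file.  When
+ * pencil_access is defined, the attribute on that declaration tells ppcg to
+ * treat copy_summary() above as a summary of the memory accessed by copy():
+ * inside the scop, a call copy(b, a, i) may write b[i] and read a[i], and
+ * nothing else.
+ */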
+ +#ifdef pencil_access +__attribute__((pencil_access(copy_summary))) +#endif +void copy(int b[1000], int a[1000], int pos); + +int main() +{ + int a[1000], b[1000]; + + for (int i = 0; i < 1000; ++i) + a[i] = i; +#pragma scop + for (int i = 0; i < 1000; ++i) + copy(b, a, i); +#pragma endscop + for (int i = 0; i < 1000; ++i) + if (b[i] != a[i]) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/call2.c b/polly/lib/External/ppcg/tests/call2.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/call2.c @@ -0,0 +1,29 @@ +#include + +void copy_summary(int b[1000], int a[1000], int pos) +{ + b[pos] = 0; + int c = a[pos]; +} + +#ifdef pencil_access +__attribute__((pencil_access(copy_summary))) +#endif +void copy(int b[1000], int a[1000], int pos); + +int main() +{ + int a[2][1000]; + + for (int i = 0; i < 1000; ++i) + a[0][i] = i; +#pragma scop + for (int i = 0; i < 1000; ++i) + copy(a[1], a[0], i); +#pragma endscop + for (int i = 0; i < 1000; ++i) + if (a[1][i] != a[0][i]) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/call2_opencl_functions.cl b/polly/lib/External/ppcg/tests/call2_opencl_functions.cl new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/call2_opencl_functions.cl @@ -0,0 +1,4 @@ +void copy(__global int b[1000], __global int a[1000], int pos) +{ + b[pos] = a[pos]; +} diff --git a/polly/lib/External/ppcg/tests/call3.c b/polly/lib/External/ppcg/tests/call3.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/call3.c @@ -0,0 +1,32 @@ +#include + +void copy_summary(int b[100], int a[100]) +{ + for (int i = 0; i < 100; ++i) { + b[i] = 0; + int c = a[i]; + } +} + +#ifdef pencil_access +__attribute__((pencil_access(copy_summary))) +#endif +void copy(int b[100], int a[100]); + +int main() +{ + int A[100][100], B[100]; + + for (int i = 0; i < 100; ++i) + B[i] = i; +#pragma scop + for (int i = 0; i < 100; ++i) + copy(A[i], B); +#pragma endscop + for (int i = 0; i < 100; ++i) + for (int j = 0; j < 100; ++j) + if (A[j][i] != B[i]) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/call3_opencl_functions.cl b/polly/lib/External/ppcg/tests/call3_opencl_functions.cl new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/call3_opencl_functions.cl @@ -0,0 +1,5 @@ +void copy(__global int b[100], __global int a[100]) +{ + for (int i = 0; i < 100; ++i) + b[i] = a[i]; +} diff --git a/polly/lib/External/ppcg/tests/call_opencl_functions.cl b/polly/lib/External/ppcg/tests/call_opencl_functions.cl new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/call_opencl_functions.cl @@ -0,0 +1,4 @@ +void copy(__global int b[1000], __global int a[1000], int pos) +{ + b[pos] = a[pos]; +} diff --git a/polly/lib/External/ppcg/tests/dead.c b/polly/lib/External/ppcg/tests/dead.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/dead.c @@ -0,0 +1,23 @@ +#include + +int main() +{ + int a[1000], b[1000]; + + for (int i = 0; i < 1000; ++i) + a[i] = i; +#pragma scop + for (int i = 0; i < 1000; ++i) { + int c; + int d; + c = a[i]; + d = c; + b[i] = c; + } +#pragma endscop + for (int i = 0; i < 1000; ++i) + if (b[i] != a[i]) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/iterator.c b/polly/lib/External/ppcg/tests/iterator.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/iterator.c @@ -0,0 +1,18 @@ +#include + +int 
main() +{ + int i; + int a[101]; + + i = 0; +#pragma scop + for (i = 0; i < 100; ++i) + a[i] = i; + a[i] = i; +#pragma endscop + if (a[100] != 100) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/live_out.c b/polly/lib/External/ppcg/tests/live_out.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/live_out.c @@ -0,0 +1,22 @@ +#include + +/* Check that a write access is not removed from the live-out + * accesses only because a strict subset of the (potentially) + * accessed elements are killed by a later write. + */ +int main() +{ + int A[10]; + + A[1] = 0; +#pragma scop + int i = 1; + i = i * i; + A[i] = 1; + A[0] = 0; +#pragma endscop + if (A[1] != 1) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/local.c b/polly/lib/External/ppcg/tests/local.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/local.c @@ -0,0 +1,22 @@ +#include + +int main() +{ + int A[100]; + +#pragma scop + { + int B[100]; + B[0] = 0; + for (int i = 1; i < 100; ++i) + B[i] = B[i - 1] + 1; + for (int i = 0; i < 100; ++i) + A[i] = B[i]; + } +#pragma endscop + for (int i = 0; i < 100; ++i) + if (A[i] != i) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/loop.c b/polly/lib/External/ppcg/tests/loop.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/loop.c @@ -0,0 +1,18 @@ +#include + +int main() +{ + int a[1000], b[1000]; + + for (int i = 0; i < 1000; ++i) + a[i] = i; +#pragma scop + for (int i = 0; i < 1000; ++i) + b[i] = a[i]; +#pragma endscop + for (int i = 0; i < 1000; ++i) + if (b[i] != a[i]) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/not_accessed.c b/polly/lib/External/ppcg/tests/not_accessed.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/not_accessed.c @@ -0,0 +1,29 @@ +#include + +void copy_summary(int b[1000], int a[1000], int pos, int c[1000]) +{ + b[pos] = 0; + int d = a[pos]; +} + +#ifdef pencil_access +__attribute__((pencil_access(copy_summary))) +#endif +void copy(int b[1000], int a[1000], int pos, int c[1000]); + +int main() +{ + int a[1000], b[1000], c[1000]; + + for (int i = 0; i < 1000; ++i) + a[i] = i; +#pragma scop + for (int i = 0; i < 1000; ++i) + copy(b, a, i, c); +#pragma endscop + for (int i = 0; i < 1000; ++i) + if (b[i] != a[i]) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl b/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl @@ -0,0 +1,5 @@ +void copy(__global int b[1000], __global int a[1000], int pos, + __global int c[1000]) +{ + b[pos] = a[pos]; +} diff --git a/polly/lib/External/ppcg/tests/scalar.c b/polly/lib/External/ppcg/tests/scalar.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/scalar.c @@ -0,0 +1,13 @@ +#include + +int main() +{ + int a; +#pragma scop + a = 1; +#pragma endscop + if (a != 1) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/shared_sink.c b/polly/lib/External/ppcg/tests/shared_sink.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/shared_sink.c @@ -0,0 +1,25 @@ +#include + +/* Check that the sources of live ranges with the same sink + * are executed in order. 
+ */ +int main() +{ + int A[128]; + int n = 128; + + A[0] = 0; +#pragma scop + for (int i = 0; i < n; ++i) { + int set = 0; + if (A[i] < 2) + set = 1; + if (set) + A[i] = 2; + } +#pragma endscop + if (A[0] != 2) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/struct.c b/polly/lib/External/ppcg/tests/struct.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/struct.c @@ -0,0 +1,31 @@ +#include + +struct s { + int c[10][10]; +}; + +int main() +{ + struct s a[10][10], b[10][10]; + + for (int i = 0; i < 10; ++i) + for (int j = 0; j < 10; ++j) + for (int k = 0; k < 10; ++k) + for (int l = 0; l < 10; ++l) + a[i][j].c[k][l] = i + j + k + l; +#pragma scop + for (int i = 0; i < 10; ++i) + for (int j = 0; j < 10; ++j) + for (int k = 0; k < 10; ++k) + for (int l = 0; l < 10; ++l) + b[i][j].c[k][l] = i + j + k + l; +#pragma endscop + for (int i = 0; i < 10; ++i) + for (int j = 0; j < 10; ++j) + for (int k = 0; k < 10; ++k) + for (int l = 0; l < 10; ++l) + if (b[i][j].c[k][l] != a[i][j].c[k][l]) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/struct2.c b/polly/lib/External/ppcg/tests/struct2.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/struct2.c @@ -0,0 +1,21 @@ +#include + +struct s { + int a; +}; + +int main() +{ + struct s a, b[10]; + +#pragma scop + a.a = 42; + for (int i = 0; i < 10; ++i) + b[i].a = a.a; +#pragma endscop + for (int i = 0; i < 10; ++i) + if (b[i].a != 42) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/struct3.c b/polly/lib/External/ppcg/tests/struct3.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/struct3.c @@ -0,0 +1,25 @@ +#include + +struct s { + int a; + int b; +}; + +int main() +{ + struct s a, b[10]; + + a.b = 57; +#pragma scop + a.a = 42; + for (int i = 0; i < 10; ++i) + b[i] = a; +#pragma endscop + for (int i = 0; i < 10; ++i) + if (b[i].a != 42) + return EXIT_FAILURE; + if (a.b != 57) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/tests/struct4.c b/polly/lib/External/ppcg/tests/struct4.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/tests/struct4.c @@ -0,0 +1,27 @@ +#include + +struct s { + int a; + int b; +}; + +int main() +{ + int a[10]; + + for (int i = 0; i < 10; ++i) + a[i] = 0; +#pragma scop + for (int i = 0; i < 10; ++i) { + struct s b; + b.a = 1; + b.b = i; + a[i] = b.a + b.b; + } +#pragma endscop + for (int i = 0; i < 10; ++i) + if (a[i] != 1 + i) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/polly/lib/External/ppcg/util.h b/polly/lib/External/ppcg/util.h new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/util.h @@ -0,0 +1,22 @@ +#ifndef UTIL_H +#define UTIL_H + +#include + +#include +#include + +/* Compare the prefix of "s" to "prefix" up to the length of "prefix". 
+ */ +static inline int prefixcmp(const char *s, const char *prefix) +{ + return strncmp(s, prefix, strlen(prefix)); +} + +__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space, + int val); +__isl_give isl_multi_val *ppcg_multi_val_from_int_list( + __isl_take isl_space *space, int *list); +__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set); + +#endif diff --git a/polly/lib/External/ppcg/util.c b/polly/lib/External/ppcg/util.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/util.c @@ -0,0 +1,105 @@ +/* + * Copyright 2012-2013 Ecole Normale Superieure + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include +#include +#include +#include + +#include "util.h" + +/* Construct an isl_multi_val living in "space" with all values equal to "val". + */ +__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space, + int val) +{ + int i, n; + isl_ctx *ctx; + isl_val *v; + isl_multi_val *mv; + + if (!space) + return NULL; + + ctx = isl_space_get_ctx(space); + n = isl_space_dim(space, isl_dim_set); + mv = isl_multi_val_zero(space); + v = isl_val_int_from_si(ctx, val); + for (i = 0; i < n; ++i) + mv = isl_multi_val_set_val(mv, i, isl_val_copy(v)); + isl_val_free(v); + + return mv; +} + +/* Construct an isl_multi_val living in "space" with values specified + * by "list". "list" is assumed to have at least as many entries + * as the set dimension of "space". + */ +__isl_give isl_multi_val *ppcg_multi_val_from_int_list( + __isl_take isl_space *space, int *list) +{ + int i, n; + isl_ctx *ctx; + isl_multi_val *mv; + + if (!space) + return NULL; + + ctx = isl_space_get_ctx(space); + n = isl_space_dim(space, isl_dim_set); + mv = isl_multi_val_zero(space); + for (i = 0; i < n; ++i) { + isl_val *v; + + v = isl_val_int_from_si(ctx, list[i]); + mv = isl_multi_val_set_val(mv, i, v); + } + + return mv; +} + +/* Compute the size of a bounding box around the origin and "set", + * where "set" is assumed to contain only non-negative elements. + * In particular, compute the maximal value of "set" in each direction + * and add one. 
+ */ +__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set) +{ + int i, n; + isl_multi_pw_aff *mpa; + + n = isl_set_dim(set, isl_dim_set); + mpa = isl_multi_pw_aff_zero(isl_set_get_space(set)); + for (i = 0; i < n; ++i) { + isl_space *space; + isl_aff *one; + isl_pw_aff *bound; + + if (!isl_set_dim_has_upper_bound(set, isl_dim_set, i)) { + const char *name; + name = isl_set_get_tuple_name(set); + if (!name) + name = ""; + fprintf(stderr, "unable to determine extent of '%s' " + "in dimension %d\n", name, i); + set = isl_set_free(set); + } + bound = isl_set_dim_max(isl_set_copy(set), i); + + space = isl_pw_aff_get_domain_space(bound); + one = isl_aff_zero_on_domain(isl_local_space_from_space(space)); + one = isl_aff_add_constant_si(one, 1); + bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one)); + mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound); + } + isl_set_free(set); + + return mpa; +} diff --git a/polly/lib/External/ppcg/version.c b/polly/lib/External/ppcg/version.c new file mode 100644 --- /dev/null +++ b/polly/lib/External/ppcg/version.c @@ -0,0 +1,6 @@ +#include "gitversion.h" + +const char *ppcg_version(void) +{ + return GIT_HEAD_ID"\n"; +} diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp --- a/polly/lib/Support/RegisterPasses.cpp +++ b/polly/lib/Support/RegisterPasses.cpp @@ -219,6 +219,14 @@ void initializePollyPasses(llvm::PassRegistry &Registry) { initializeCodeGenerationPass(Registry); +#ifdef GPU_CODEGEN + initializePPCGCodeGenerationPass(Registry); + initializeManagedMemoryRewritePassPass(Registry); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); +#endif initializeCodePreparationPass(Registry); initializeDeadCodeElimWrapperPassPass(Registry); initializeDependenceInfoPass(Registry); diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -711,6 +711,11 @@ function_ref GetDeps, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, isl::schedule &LastSchedule, bool &DepsChanged) { + + // Skip SCoPs in case they're already optimised by PPCGCodeGeneration + if (S.isToBeSkipped()) + return; + // Skip empty SCoPs but still allow code generation as it will delete the // loops present but not needed. if (S.getSize() == 0) { diff --git a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll b/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll @@ -0,0 +1,9 @@ +define float @__nv_expf(float %a) { + ret float %a +} +define float @__nv_cosf(float %a) { + ret float %a +} +define float @__nv_logf(float %a) { + ret float %a +} diff --git a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll b/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll @@ -0,0 +1,71 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; Check that we detect a scop. 
+; SCOP: Function: checkScalarKill +; SCOP-NEXT: Region: %XLoopInit---%for.end +; SCOP-NEXT: Max Loop Depth: 1 + +; Check that we have a scalar that is not a phi node in the scop. +; SCOP: i32 MemRef_x_0; // Element size 4 + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) + +; Check that we add variables that are local to a scop into the kills that we +; pass to PPCG. This should enable PPCG to codegen this example. +; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) { +; int x; +; #pragma scop +; for(int i = 0; i < 1000; i++) { +; XLoopInit: x = 0; +; +; if (control1 > 2) +; C1Add: x += 10; +; if (control2 > 3) +; C2Add: x += A[i]; +; +; BLoopAccumX: B[i] += x; +; } +; +; #pragma endscop +; } +; ModuleID = 'test.ll' +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @checkScalarKill(ptr %A, ptr %B, ptr %C, i32 %control1, i32 %control2) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %XLoopInit + +XLoopInit: ; preds = %entry.split, %BLoopAccumX + %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ] + %cmp1 = icmp sgt i32 %control1, 2 + %x.0 = select i1 %cmp1, i32 10, i32 0 + %cmp2 = icmp sgt i32 %control2, 3 + br i1 %cmp2, label %C2Add, label %BLoopAccumX + +C2Add: ; preds = %XLoopInit + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %tmp6 = load i32, ptr %arrayidx, align 4 + %add4 = add nsw i32 %tmp6, %x.0 + br label %BLoopAccumX + +BLoopAccumX: ; preds = %XLoopInit, %C2Add + %x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ] + %arrayidx7 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %tmp11 = load i32, ptr %arrayidx7, align 4 + %add8 = add nsw i32 %tmp11, %x.1 + store i32 %add8, ptr %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %XLoopInit, label %for.end + +for.end: ; preds = %BLoopAccumX + ret void +} diff --git a/polly/test/GPGPU/align-params-in-schedule.ll b/polly/test/GPGPU/align-params-in-schedule.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/align-params-in-schedule.ll @@ -0,0 +1,53 @@ +; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \ +; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \ +; RUN: FileCheck %s + +; REQUIRES: pollyacc + +; CHECK: polly_launchKernel + +; Verify that this program compiles. At some point, this compilation crashed +; due to insufficient parameters being available. 
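+; The CHECK line above is intentionally minimal: the test passes as long as a
+; kernel launch is emitted at all, i.e. as long as PPCG code generation succeeds
+; instead of aborting.  See bounds-construction-with-ignore-param-bounds.ll for
+; why -polly-ignore-parameter-bounds can leave parameters out of the SCoP context.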
+ +source_filename = "bugpoint-output-4d01492.bc" +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +%struct.barney = type { ptr, i64, i64, [2 x %struct.widget] } +%struct.widget = type { i64, i64, i64 } + +@global = external unnamed_addr global %struct.barney, align 32 + +; Function Attrs: nounwind uwtable +define void @wobble(ptr noalias %arg) #0 { +bb: + %tmp = load i32, ptr %arg, align 4 + br label %bb1 + +bb1: ; preds = %bb13, %bb + %tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ] + br label %bb3 + +bb3: ; preds = %bb3, %bb1 + %tmp4 = load ptr, ptr @global, align 32 + %tmp5 = sext i32 %tmp2 to i64 + %tmp6 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 3, i64 1, i32 0), align 8 + %tmp7 = mul i64 %tmp6, %tmp5 + %tmp8 = add i64 %tmp7, 0 + %tmp9 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 1), align 8 + %tmp10 = add i64 %tmp8, %tmp9 + %tmp11 = getelementptr i32, ptr %tmp4, i64 %tmp10 + store i32 undef, ptr %tmp11, align 4 + %tmp12 = icmp eq i32 0, 0 + br i1 %tmp12, label %bb13, label %bb3 + +bb13: ; preds = %bb3 + %tmp14 = icmp eq i32 %tmp2, %tmp + %tmp15 = add i32 %tmp2, 1 + br i1 %tmp14, label %bb16, label %bb1 + +bb16: ; preds = %bb13 + ret void +} + +attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll b/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll @@ -0,0 +1,50 @@ +; RUN: opt %loadPolly -S -polly-codegen-ppcg \ +; RUN: -polly-use-llvm-names < %s +; ModuleID = 'test/GPGPU/zero-size-array.ll' + +; REQUIRES: pollyacc + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + + +; We used to divide the element size by 8 to arrive at the 'actual' size +; of an array element. This used to cause arrays that have an element size +; of less than 8 to collapse to size 0. This test makes sure that it does +; not happen anymore. 
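+; (The collapse was a plain integer-division truncation: any size value below 8
+; divided by 8 yields 0, so the affected arrays ended up with zero-sized elements.)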
+ +; f(int *niters_ptr, int *arr[0]) { +; const int inters = *niters_ptr; +; for(int i = 0; i < niters; i++) { +; arr[0][i + 1] = 0 +; } +; } + +; Function Attrs: nounwind uwtable +define void @f(ptr noalias %niters.ptr, ptr noalias %arr) #0 { +entry: + %niters = load i32, ptr %niters.ptr, align 4 + br label %loop.body + +loop.body: ; preds = %loop.body, %entry + %indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ] + %indvar.sext = sext i32 %indvar to i64 + %arr.slot = getelementptr [0 x i32], ptr %arr, i64 0, i64 %indvar.sext + store i32 0, ptr %arr.slot, align 4 + %tmp8 = icmp eq i32 %indvar, %niters + %indvar.next = add i32 %indvar, 1 + br i1 %tmp8, label %loop.exit, label %loop.body + +loop.exit: ; preds = %loop.body + %tmp10 = icmp sgt i32 undef, 0 + br label %auxiliary.loop + +auxiliary.loop: ; preds = %"101", %loop.exit + %tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ] + br i1 undef, label %auxiliary.loop, label %exit + +exit: ; preds = %auxiliary.loop + ret void +} + +attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll b/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll @@ -0,0 +1,55 @@ +; RUN: opt %loadPolly -S -polly-codegen-ppcg \ +; RUN: -polly-ignore-parameter-bounds \ +; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=HOST-IR +; +; REQUIRES: pollyacc + +; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain +; all the parameters present in the program. +; +; The construction of the `isl_multi_pw_aff` requires all the indivisual `pw_aff` +; to have the same parameter dimensions. To achieve this, we used to realign +; every `pw_aff` with `Scop::Context`. However, in conjunction with +; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context` +; does not contain all parameters. +; +; We check that Polly does the right thing in this case and sets up the parameter +; dimensions correctly. + + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. 
+; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) +; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll' + +; C pseudocode +; ------------ +; void f(int *arr, long niters, long stride) { +; for(int i = 0; i < niters; i++) { +; arr[i * stride] = 1; +; } +; } + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @f(ptr %arr, i64 %niters, i64 %stride) unnamed_addr #1 { +entry: + br label %loop + +loop: ; preds = %loop, %entry + %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ] + %idx = mul nuw nsw i64 %indvar, %stride + %slot = getelementptr i32, ptr %arr, i64 %idx + store i32 1, ptr %slot, align 4 + %indvar.next = add nuw nsw i64 %indvar, 1 + %check = icmp sgt i64 %indvar.next, %niters + br i1 %check, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind uwtable } diff --git a/polly/test/GPGPU/cuda-annotations.ll b/polly/test/GPGPU/cuda-annotations.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/cuda-annotations.ll @@ -0,0 +1,37 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=KERNEL %s + +; REQUIRES: pollyacc + +; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) #0 { + +; KERNEL: !nvvm.annotations = !{!0} + +; KERNEL: !0 = !{ptr @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1} + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(ptr %A, i64 %n) { +bb: + br label %bb1 + +bb1: ; preds = %bb6, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] + %tmp = icmp slt i64 %i.0, %n + br i1 %tmp, label %bb2, label %bb8 + +bb2: ; preds = %bb1 + %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0 + %tmp4 = load i64, ptr %tmp3, align 8 + %tmp5 = add nsw i64 %tmp4, 100 + store i64 %tmp5, ptr %tmp3, align 8 + br label %bb6 + +bb6: ; preds = %bb2 + %tmp7 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb8: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/cuda-managed-memory-simple.ll @@ -0,0 +1,118 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \ +; RUN: FileCheck %s + +; REQUIRES: pollyacc + +; +; #include +; +; static const int N = 45; +; +; void copy(int *R, int *A) { +; for (int i = 0; i < N; i++) { +; R[i] = A[i] * 10; +; } +; } +; +; int main() { +; int *A, *R; +; +; cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal); +; cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal); +; +; for (int i = 0; i < N; i++) { +; A[i] = i; +; R[i] = 0; +; } +; copy(R, A); +; +; return 0; +; } +; + +; CHECK-NOT: polly_copyFromHostToDevice +; CHECK-NOT: polly_copyFromDeviceToHost +; CHECK-NOT: polly_freeDeviceMemory +; CHECK-NOT: polly_allocateMemoryForDevice + +; CHECK: %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA() +; CHECK-NEXT: %[[REGCA:[0-9]+]] = bitcast i32* %A to i8* +; CHECK-NEXT: %[[REGCR:[0-9]+]] = bitcast i32* %R to i8* +; CHECK-NEXT: 
%[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; CHECK-NEXT: store i8* %[[REGCA]], i8** %polly_launch_0_param_0 +; CHECK-NEXT: %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8* +; CHECK-NEXT: store i8* %[[REGCP0]], i8** %[[REGGEP0]] +; CHECK-NEXT: %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; CHECK-NEXT: store i8* %[[REGCR]], i8** %polly_launch_0_param_1 +; CHECK-NEXT: %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8* +; CHECK-NEXT: store i8* %[[REGCP1]], i8** %[[REGGEP1]] +; CHECK-NEXT: %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([863 x i8], [863 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0)) +; CHECK-NEXT: call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) +; CHECK-NEXT: call void @polly_freeKernel(i8* %[[REGKERNEL]]) +; CHECK-NEXT: call void @polly_synchronizeDevice() +; CHECK-NEXT: call void @polly_freeContext(i8* %[[REGCTX]]) + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @copy(i32* %R, i32* %A) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %exitcond = icmp ne i64 %indvars.iv, 45 + br i1 %exitcond, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %tmp, 10 + %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv + store i32 %mul, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define i32 @main() { +entry: + %A = alloca i32*, align 8 + %R = alloca i32*, align 8 + %tmp = bitcast i32** %A to i8** + %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2 + %tmp1 = bitcast i32** %R to i8** + %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %exitcond = icmp ne i64 %indvars.iv, 45 + br i1 %exitcond, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %tmp2 = load i32*, i32** %A, align 8 + %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv + %tmp3 = trunc i64 %indvars.iv to i32 + store i32 %tmp3, i32* %arrayidx, align 4 + %tmp4 = load i32*, i32** %R, align 8 + %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv + store i32 0, i32* %arrayidx3, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %tmp5 = load i32*, i32** %R, align 8 + %tmp6 = load i32*, i32** %A, align 8 + call void @copy(i32* %tmp5, i32* %tmp6) + ret i32 0 +} + +declare i32 @cudaMallocManaged(i8**, i64, i32) #1 diff --git a/polly/test/GPGPU/debug-metadata-leak.ll b/polly/test/GPGPU/debug-metadata-leak.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/debug-metadata-leak.ll @@ -0,0 +1,104 @@ +; RUN: opt %loadPolly %s -polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: | FileCheck 
--check-prefix=KERNEL-IR %s + +; REQUIRES: pollyacc + +; KERNEL-IR: define ptx_kernel void @FUNC_vec_add_1_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arr, i32 %N) #0 { + +; The instruction marked <<>> is copied into the GPUModule, +; with changes only to the parameters to access data on the device instead of +; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the +; instruction is annotated with a DILocation, copying the instruction also copies +; the metadata into the GPUModule. This stops codegenerating the ptx_kernel by +; failing the verification of the Module in GPUNodeBuilder::finalize, due to the +; copied DICompileUnit not being listed in a llvm.dbg.cu which was neither copied +; nor created. +; +; https://reviews.llvm.org/D35630 removes this debug metadata before the +; instruction is copied to the GPUModule. +; +; vec_add_1.c: +; void vec_add_1(int N, int arr[N]) { +; int i=0; +; for( i=0 ; i>> + store i32 %add, ptr %arrayidx, align 4, !dbg !26, !tbaa !27 + br label %for.inc, !dbg !25 + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31 + call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19 + br label %for.cond, !dbg !32, !llvm.loop !33 + +for.end: ; preds = %for.cond + ret void, !dbg !35 +} + +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 5.0.0"} +!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10, !11} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64) +!12 = !{!13, !14, !15} +!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11) +!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10) +!16 = !DIExpression() +!17 = !DILocation(line: 1, column: 20, scope: !7) +!18 = !DILocation(line: 1, column: 27, scope: !7) +!19 = !DILocation(line: 2, column: 7, scope: !7) +!20 = !DILocation(line: 3, column: 8, scope: !21) +!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!22 = !DILocation(line: 3, column: 15, scope: !23) +!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3) +!24 = !DILocation(line: 3, column: 3, scope: !21) +!25 = !DILocation(line: 3, column: 25, scope: !23) +!26 = !DILocation(line: 3, column: 32, scope: !23) +!27 = !{!28, !28, i64 0} +!28 = !{!"int", !29, i64 0} +!29 = !{!"omnipotent char", !30, i64 0} +!30 = !{!"Simple C/C++ TBAA"} +!31 = !DILocation(line: 3, column: 21, scope: !23) +!32 = !DILocation(line: 3, column: 3, scope: !23) +!33 = distinct !{!33, !24, !34} +!34 = !DILocation(line: 3, column: 35, scope: !21) +!35 = !DILocation(line: 4, column: 1, scope: !7) diff 
--git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/double-parallel-loop.ll @@ -0,0 +1,254 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=SCHED %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s -check-prefix=KERNEL-IR + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s -check-prefix=KERNEL-ASM + +; XFAIL: * + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; This fails today due to extensive output differences from when the test was written. + +; CHECK: Stmt_bb5 +; CHECK-NEXT: Domain := +; CHECK-NEXT: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }; +; CHECK-NEXT: Schedule := +; CHECK-NEXT: { Stmt_bb5[i0, i1] -> [i0, i1] }; +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] }; +; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] }; + +; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }" +; SCHED-NEXT: child: +; SCHED-NEXT: context: "{ [] }" +; SCHED-NEXT: child: +; SCHED-NEXT: extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }" +; SCHED-NEXT: child: +; SCHED-NEXT: sequence: +; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }" +; SCHED-NEXT: child: +; SCHED-NEXT: set: +; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }" +; SCHED-NEXT: child: +; SCHED-NEXT: guard: "{ [] }" +; SCHED-NEXT: - filter: "{ Stmt_bb5[i0, i1] }" +; SCHED-NEXT: child: +; SCHED-NEXT: guard: "{ [] }" +; SCHED-NEXT: child: +; SCHED-NEXT: mark: "kernel" +; SCHED-NEXT: child: +; SCHED-NEXT: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }" +; SCHED-NEXT: child: +; SCHED-NEXT: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }" +; SCHED-NEXT: child: +; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]" +; SCHED-NEXT: permutable: 1 +; SCHED-NEXT: coincident: [ 1, 1 ] +; SCHED-NEXT: child: +; SCHED-NEXT: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }" +; SCHED-NEXT: child: +; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]" +; SCHED-NEXT: permutable: 1 +; SCHED-NEXT: coincident: [ 1, 1 ] +; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }" +; SCHED-NEXT: child: +; SCHED-NEXT: set: +; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }" +; SCHED-NEXT: child: +; SCHED-NEXT: guard: "{ [] }" + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: 
{ +; CODE-NEXT: dim3 k0_dimBlock(16, 32); +; CODE-NEXT: dim3 k0_dimGrid(32, 32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) +; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3); + +; IR: polly.split_new_and_old: +; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024) +; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1 +; IR-NEXT: %polly.overflow.state = or i1 false, %.obit +; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0 +; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024) +; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1 +; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1 +; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0 +; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3) +; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1 +; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4 +; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0 +; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6) +; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1 +; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7 +; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0 +; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440 +; IR-NEXT: %5 = and i1 true, %4 +; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true +; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown +; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2 + +; IR: polly.start: +; IR-NEXT: br label %polly.acc.initialize + +; IR: polly.acc.initialize: +; IR-NEXT: [[GPUContext:%.*]] = call ptr @polly_initContext() +; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 4194304) +; IR-NEXT: call void @polly_copyFromHostToDevice(ptr %A, ptr %p_dev_array_MemRef_A, i64 4194304) +; IR-NEXT: [[DevPtr:%.*]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A) +; IR-NEXT: store ptr [[DevPtr]], ptr %polly_launch_0_param_0 +; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params +; IR-NEXT: call ptr @polly_getKernel +; IR-NEXT: call void @polly_launchKernel(ptr %11, i32 32, i32 32, i32 32, i32 16, i32 1, ptr %polly_launch_0_params_i8ptr) +; IR-NEXT: call void @polly_freeKernel +; IR-NEXT: call void @polly_copyFromDeviceToHost(ptr %p_dev_array_MemRef_A, ptr %A, i64 4194304) +; IR-NEXT: call void @polly_freeDeviceMemory(ptr %p_dev_array_MemRef_A) +; IR-NEXT: call void @polly_freeContext(ptr [[GPUContext]]) +; IR-NEXT: br label %polly.exiting + +; IR: polly.exiting: +; IR-NEXT: br label %polly.merge_new_and_old + +; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(ptr %MemRef_A) #0 { +; KERNEL-IR-NEXT: entry: +; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64 +; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() +; KERNEL-IR-NEXT: %b1 = zext i32 %1 to i64 +; KERNEL-IR-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; KERNEL-IR-NEXT: %t0 = zext i32 %2 to i64 +; KERNEL-IR-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() +; KERNEL-IR-NEXT: %t1 = zext i32 %3 to i64 +; KERNEL-IR-NEXT: br label %polly.loop_preheader + +; KERNEL-IR-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5 +; 
KERNEL-IR-NEXT: ret void + +; KERNEL-IR-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader +; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ] +; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0 +; KERNEL-IR-NEXT: %6 = mul nsw i64 32, %b1 +; KERNEL-IR-NEXT: %7 = add nsw i64 %6, %t1 +; KERNEL-IR-NEXT: %8 = mul nsw i64 16, %polly.indvar +; KERNEL-IR-NEXT: %9 = add nsw i64 %7, %8 +; KERNEL-IR-NEXT: br label %polly.stmt.bb5 + +; KERNEL-IR-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header +; KERNEL-IR-NEXT: %10 = mul i64 %5, %9 +; KERNEL-IR-NEXT: %p_tmp6 = sitofp i64 %10 to float +; KERNEL-IR-NEXT: %11 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %12 = add nsw i64 %11, %t0 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024 +; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b1 +; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t1 +; KERNEL-IR-NEXT: %15 = mul nsw i64 16, %polly.indvar +; KERNEL-IR-NEXT: %16 = add nsw i64 %14, %15 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16 +; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A +; KERNEL-IR-NEXT: %tmp8_p_scalar_ = load float, ptr %polly.access.MemRef_A, align 4 +; KERNEL-IR-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6 +; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024 +; KERNEL-IR-NEXT: %19 = mul nsw i64 32, %b1 +; KERNEL-IR-NEXT: %20 = add nsw i64 %19, %t1 +; KERNEL-IR-NEXT: %21 = mul nsw i64 16, %polly.indvar +; KERNEL-IR-NEXT: %22 = add nsw i64 %20, %21 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22 +; KERNEL-IR-NEXT: %polly.access.MemRef_A4 = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A3 +; KERNEL-IR-NEXT: store float %p_tmp9, ptr %polly.access.MemRef_A4, align 4 +; KERNEL-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 +; KERNEL-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 0 +; KERNEL-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit + +; KERNEL-IR-LABEL: polly.loop_preheader: ; preds = %entry +; KERNEL-IR-NEXT: br label %polly.loop_header + +; KERNEL-IR: attributes #0 = { "polly.skip.fn" } + +; KERNEL-ASM: .version 3.2 +; KERNEL-ASM-NEXT: .target sm_30 +; KERNEL-ASM-NEXT: .address_size 64 + +; KERNEL-ASM: // .globl kernel_0 + +; KERNEL-ASM: .visible .entry kernel_0( +; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0 +; KERNEL-ASM-NEXT: ) + +; void double_parallel_loop(float A[][1024]) { +; for (long i = 0; i < 1024; i++) +; for (long j = 0; j < 1024; j++) +; A[i][j] += i * j; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @double_parallel_loop(ptr %A) { +bb: + br label %bb2 + +bb2: ; preds = %bb13, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] + %exitcond1 = icmp ne i64 %i.0, 1024 + br i1 %exitcond1, label %bb3, label %bb15 + +bb3: ; preds = %bb2 + br label %bb4 + +bb4: ; preds = %bb10, %bb3 + %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] + %exitcond = icmp ne i64 %j.0, 1024 + br i1 %exitcond, label %bb5, label %bb12 + +bb5: ; preds = %bb4 + %tmp = mul nuw nsw i64 %i.0, %j.0 + %tmp6 = sitofp i64 %tmp to float + %tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0 + %tmp8 = load float, ptr %tmp7, align 4 + %tmp9 = fadd float 
%tmp8, %tmp6 + store float %tmp9, ptr %tmp7, align 4 + br label %bb10 + +bb10: ; preds = %bb5 + %tmp11 = add nuw nsw i64 %j.0, 1 + br label %bb4 + +bb12: ; preds = %bb4 + br label %bb13 + +bb13: ; preds = %bb12 + %tmp14 = add nuw nsw i64 %i.0, 1 + br label %bb2 + +bb15: ; preds = %bb2 + ret void +} diff --git a/polly/test/GPGPU/failing-invariant-load-handling.ll b/polly/test/GPGPU/failing-invariant-load-handling.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/failing-invariant-load-handling.ll @@ -0,0 +1,57 @@ +; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS +; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN + +; REQUIRES: pollyacc + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" + +%S = type { i32, i32, [12 x %L] } +%L = type { i32, i32, double, i32, i32, i32, i32, i32 } + +define void @test(ptr %cpi, i1 %b) { +; SCOPS-LABEL: Region: %if.then14---%exit +; SCOPS: Invariant Accesses: { +; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] }; +; SCOPS-NEXT: Execution Context: [l2, l1] -> { : } +; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] }; +; SCOPS-NEXT: Execution Context: [l2, l1] -> { : l2 > 0 } +; SCOPS-NEXT: } +; SCOPS: Arrays { +; SCOPS-NEXT: i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4 +; SCOPS-NEXT: } + +; Check that we gracefully handle failing invariant loads. +; This test case is taken from: +; test/Isl/CodeGen/invariant-load-dimension.ll + +; FIXME: Figure out how to actually generate code for this loop. 
+; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function + +entry: + %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1 + br i1 %b, label %if.then14, label %exit + +if.then14: + %l0 = load i32, ptr %cpi, align 8 + %cmp12.i = icmp sgt i32 %l0, 0 + br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit + +for.body.lr.ph.i: + %l1 = load i32, ptr %nt, align 4 + br label %for.body.i + +for.body.i: + %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ] + %mul.i163 = mul nsw i32 %phi, %l1 + %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0 + store i32 0, ptr %cv, align 8 + %inc = add nuw nsw i32 %phi, 1 + %l2 = load i32, ptr %cpi, align 8 + %cmp.i164 = icmp slt i32 %inc, %l2 + br i1 %cmp.i164, label %for.body.i, label %exit + +exit: + ret void +} diff --git a/polly/test/GPGPU/failing-invariant-load-hoisting.ll b/polly/test/GPGPU/failing-invariant-load-hoisting.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/failing-invariant-load-hoisting.ll @@ -0,0 +1,41 @@ +; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \ +; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN + +; REQUIRES: pollyacc + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" + +%S = type { i32, i32, [12 x %L] } +%L = type { i32, i32, double, i32, i32, i32, i32, i32 } + +define void @test(ptr %cpi, i1 %b) { +; CODEGEN-LABEL: @test( +; CODEGEN: polly.preload.begin: +; CODEGEN-NEXT: br i1 false + +entry: + %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1 + br i1 %b, label %if.then14, label %exit + +if.then14: + %l0 = load i32, ptr %cpi, align 8 + %cmp12.i = icmp sgt i32 %l0, 0 + br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit + +for.body.lr.ph.i: + %l1 = load i32, ptr %nt, align 4 + br label %for.body.i + +for.body.i: + %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ] + %mul.i163 = mul nsw i32 %phi, %l1 + %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0 + store i32 0, ptr %cv, align 8 + %inc = add nuw nsw i32 %phi, 1 + %l2 = load i32, ptr %cpi, align 8 + %cmp.i164 = icmp slt i32 %inc, %l2 + br i1 %cmp.i164, label %for.body.i, label %exit + +exit: + ret void +} diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/host-control-flow.ll @@ -0,0 +1,176 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \ +; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE + +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \ +; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR + +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ +; RUN: -S < %s | FileCheck %s -check-prefix=IR +; void foo(float A[2][100]) { +; for (long t = 0; t < 100; t++) +; for (long i = 1; i < 99; i++) +; A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1]; +; } + +; REQUIRES: pollyacc + +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1) +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(4); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, c0); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); 
+; CODE-NEXT: } + +; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader +; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] +; ... +; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1 +; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* +; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] +; IR: call i8* @polly_getKernel +; ... +; IR: call void @polly_freeKernel +; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 +; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99 +; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit + +; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0) +; KERNEL-IR-LABEL: entry: +; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64 +; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; KERNEL-IR-NEXT: %t0 = zext i32 %1 to i64 +; KERNEL-IR-NEXT: br label %polly.cond + +; KERNEL-IR-LABEL: polly.cond: ; preds = %entry +; KERNEL-IR-NEXT: %2 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %3 = add nsw i64 %2, %t0 +; KERNEL-IR-NEXT: %4 = icmp sle i64 %3, 97 +; KERNEL-IR-NEXT: br i1 %4, label %polly.then, label %polly.else + +; KERNEL-IR-LABEL: polly.merge: ; preds = %polly.else, %polly.stmt.for.body3 +; KERNEL-IR-NEXT: ret void + +; KERNEL-IR-LABEL: polly.then: ; preds = %polly.cond +; KERNEL-IR-NEXT: %5 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %6 = add nsw i64 %5, %t0 +; KERNEL-IR-NEXT: br label %polly.stmt.for.body3 + +; KERNEL-IR-LABEL: polly.stmt.for.body3: ; preds = %polly.then +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; KERNEL-IR-NEXT: %pexp.pdiv_r = urem i64 %c0, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100 +; KERNEL-IR-NEXT: %7 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %8 = add nsw i64 %7, %t0 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8 +; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A +; KERNEL-IR-NEXT: %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4 +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; KERNEL-IR-NEXT: %pexp.pdiv_r2 = urem i64 %c0, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100 +; KERNEL-IR-NEXT: %9 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %10 = add nsw i64 %9, %t0 +; KERNEL-IR-NEXT: %11 = add nsw i64 %10, 1 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11 +; KERNEL-IR-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4 +; KERNEL-IR-NEXT: %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4 +; KERNEL-IR-NEXT: %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_ +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; KERNEL-IR-NEXT: %pexp.pdiv_r7 = urem i64 %c0, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100 +; KERNEL-IR-NEXT: %12 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %13 = add 
nsw i64 %12, %t0 +; KERNEL-IR-NEXT: %14 = add nsw i64 %13, 2 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14 +; KERNEL-IR-NEXT: %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9 +; KERNEL-IR-NEXT: %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4 +; KERNEL-IR-NEXT: %p_add12 = fadd float %p_add, %tmp3_p_scalar_ +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; KERNEL-IR-NEXT: %15 = add nsw i64 %c0, 1 +; KERNEL-IR-NEXT: %pexp.pdiv_r12 = urem i64 %15, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100 +; KERNEL-IR-NEXT: %16 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %17 = add nsw i64 %16, %t0 +; KERNEL-IR-NEXT: %18 = add nsw i64 %17, 1 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18 +; KERNEL-IR-NEXT: %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14 +; KERNEL-IR-NEXT: %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4 +; KERNEL-IR-NEXT: %p_add17 = fadd float %tmp4_p_scalar_, %p_add12 +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; KERNEL-IR-NEXT: %19 = add nsw i64 %c0, 1 +; KERNEL-IR-NEXT: %pexp.pdiv_r17 = urem i64 %19, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100 +; KERNEL-IR-NEXT: %20 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %21 = add nsw i64 %20, %t0 +; KERNEL-IR-NEXT: %22 = add nsw i64 %21, 1 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22 +; KERNEL-IR-NEXT: %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19 +; KERNEL-IR-NEXT: store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4 +; KERNEL-IR-NEXT: br label %polly.merge + +; KERNEL-IR-LABEL: polly.else: ; preds = %polly.cond +; KERNEL-IR-NEXT: br label %polly.merge +; KERNEL-IR-NEXT: } + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo([100 x float]* %A) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc18, %entry + %t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ] + %exitcond1 = icmp ne i64 %t.0, 100 + br i1 %exitcond1, label %for.body, label %for.end20 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i64 %i.0, 99 + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %sub = add nsw i64 %i.0, -1 + %rem = srem i64 %t.0, 2 + %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub + %tmp = load float, float* %arrayidx4, align 4 + %rem5 = srem i64 %t.0, 2 + %arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0 + %tmp2 = load float, float* %arrayidx7, align 4 + %add = fadd float %tmp, %tmp2 + %add8 = add nuw nsw i64 %i.0, 1 + %rem9 = srem i64 %t.0, 2 + %arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8 + %tmp3 = load float, float* %arrayidx11, align 4 + %add12 = fadd float %add, %tmp3 + %add13 = add nuw nsw i64 %t.0, 1 + %rem14 = srem i64 %add13, 2 + %arrayidx16 = 
getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0 + %tmp4 = load float, float* %arrayidx16, align 4 + %add17 = fadd float %tmp4, %add12 + store float %add17, float* %arrayidx16, align 4 + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nuw nsw i64 %i.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc18 + +for.inc18: ; preds = %for.end + %inc19 = add nuw nsw i64 %t.0, 1 + br label %for.cond + +for.end20: ; preds = %for.cond + ret void +} diff --git a/polly/test/GPGPU/host-statement.ll b/polly/test/GPGPU/host-statement.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/host-statement.ll @@ -0,0 +1,204 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -polly-invariant-load-hoisting=false \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -polly-invariant-load-hoisting=false \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=KERNEL-IR %s + +; REQUIRES: pollyacc + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @llvm.lifetime.start(i64, ptr nocapture) #0 + +; This test case tests that we can correctly handle a ScopStmt that is +; scheduled on the host, instead of within a kernel. + +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(16); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: if (p_0 <= 510 && p_1 <= 510) { +; CODE-NEXT: { +; CODE-NEXT: dim3 k1_dimBlock(32); +; CODE-NEXT: dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16); +; CODE-NEXT: kernel1 <<>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: { +; CODE-NEXT: dim3 k2_dimBlock(16, 32); +; CODE-NEXT: dim3 k2_dimGrid(16, p_1 <= -7650 ? 
256 : -p_1 + floord(31 * p_1 + 30, 32) + 16); +; CODE-NEXT: kernel2 <<>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: } +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost)); +; CODE-NEXT: Stmt_for_cond33_preheader_last(); + +; CODE: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_for_body16(32 * b0 + t0); + +; CODE: # kernel1 +; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1) +; CODE-NEXT: for (int c1 = 0; c1 <= 15; c1 += 1) { +; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0) +; CODE-NEXT: Stmt_for_body35(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510) +; CODE-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1) +; CODE-NEXT: Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3); +; CODE-NEXT: sync0(); +; CODE-NEXT: } + +; CODE: # kernel2 +; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1) +; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510) +; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) +; CODE-NEXT: Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3); + +; KERNEL-IR: call void @llvm.nvvm.barrier0() + +; Function Attrs: nounwind uwtable +define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, ptr %A, ptr %R, ptr %Q) #1 { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry.split, %for.inc86 + %indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ] + %indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ] + br label %for.inc + +for.inc: ; preds = %for.cond1.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ] + %nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ] + %arrayidx5 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24 + %tmp = load double, ptr %arrayidx5, align 8, !tbaa !1 + %arrayidx9 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24 + %tmp27 = load double, ptr %arrayidx9, align 8, !tbaa !1 + %mul = fmul double %tmp, %tmp27 + %add = fadd double %nrm.02, %mul + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 512 + br i1 %exitcond, label %for.inc, label %for.end + +for.end: ; preds = %for.inc + %add.lcssa = phi double [ %add, %for.inc ] + %call = tail call double @sqrt(double %add.lcssa) #2 + %arrayidx13 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24 + store double %call, ptr %arrayidx13, align 8, !tbaa !1 + br label %for.body16 + +for.cond33.preheader: ; preds = %for.body16 + %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1 + %cmp347 = icmp slt i64 %indvars.iv.next25, 512 + br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86 + +for.body35.lr.ph: ; preds = %for.cond33.preheader + br label %for.body35 + +for.body16: ; preds = %for.end, %for.body16 + %indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ] + %arrayidx20 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv10, i64 
%indvars.iv24 + %tmp28 = load double, ptr %arrayidx20, align 8, !tbaa !1 + %arrayidx24 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24 + %tmp29 = load double, ptr %arrayidx24, align 8, !tbaa !1 + %div = fdiv double %tmp28, %tmp29 + %arrayidx28 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv10, i64 %indvars.iv24 + store double %div, ptr %arrayidx28, align 8, !tbaa !1 + %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1 + %exitcond12 = icmp ne i64 %indvars.iv.next11, 512 + br i1 %exitcond12, label %for.body16, label %for.cond33.preheader + +for.cond33.loopexit: ; preds = %for.body62 + %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next22 to i32 + %exitcond23 = icmp ne i32 %lftr.wideiv, 512 + br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge + +for.body35: ; preds = %for.body35.lr.ph, %for.cond33.loopexit + %indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ] + %arrayidx39 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 + store double 0.000000e+00, ptr %arrayidx39, align 8, !tbaa !1 + br label %for.body42 + +for.cond60.preheader: ; preds = %for.body42 + br label %for.body62 + +for.body42: ; preds = %for.body35, %for.body42 + %indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ] + %arrayidx46 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv13, i64 %indvars.iv24 + %tmp30 = load double, ptr %arrayidx46, align 8, !tbaa !1 + %arrayidx50 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv13, i64 %indvars.iv21 + %tmp31 = load double, ptr %arrayidx50, align 8, !tbaa !1 + %mul51 = fmul double %tmp30, %tmp31 + %arrayidx55 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 + %tmp32 = load double, ptr %arrayidx55, align 8, !tbaa !1 + %add56 = fadd double %tmp32, %mul51 + store double %add56, ptr %arrayidx55, align 8, !tbaa !1 + %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 + %exitcond15 = icmp ne i64 %indvars.iv.next14, 512 + br i1 %exitcond15, label %for.body42, label %for.cond60.preheader + +for.body62: ; preds = %for.cond60.preheader, %for.body62 + %indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ] + %arrayidx66 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21 + %tmp33 = load double, ptr %arrayidx66, align 8, !tbaa !1 + %arrayidx70 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv16, i64 %indvars.iv24 + %tmp34 = load double, ptr %arrayidx70, align 8, !tbaa !1 + %arrayidx74 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 + %tmp35 = load double, ptr %arrayidx74, align 8, !tbaa !1 + %mul75 = fmul double %tmp34, %tmp35 + %sub = fsub double %tmp33, %mul75 + %arrayidx79 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21 + store double %sub, ptr %arrayidx79, align 8, !tbaa !1 + %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 + %exitcond18 = icmp ne i64 %indvars.iv.next17, 512 + br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit + +for.cond33.for.inc86_crit_edge: ; preds = %for.cond33.loopexit + br label %for.inc86 + +for.inc86: ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond26 = icmp ne i64 %indvars.iv.next25, 512 + br i1 
%exitcond26, label %for.cond1.preheader, label %for.end88 + +for.end88: ; preds = %for.inc86 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end(i64, ptr nocapture) #0 + +; Function Attrs: nounwind +declare double @sqrt(double) #2 + +attributes #0 = { argmemonly nounwind } +attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"double", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/ignore-parameter-bounds.ll b/polly/test/GPGPU/ignore-parameter-bounds.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/ignore-parameter-bounds.ll @@ -0,0 +1,41 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; REQUIRES: pollyacc + +; CODE: Code +; CODE: ==== +; CODE: No code generated + +source_filename = "bugpoint-output-83bcdeb.bc" +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +@__data_radiation_MOD_cobi = external global [168 x double], align 32 + +; Function Attrs: nounwind uwtable +define void @__radiation_rg_MOD_coe_so() #0 { +entry: + %polly.access.kspec.load = load i32, ptr undef, align 4 + %0 = or i1 undef, undef + br label %polly.preload.cond29 + +polly.preload.cond29: ; preds = %entry + br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30 + +polly.preload.merge30: ; preds = %polly.preload.exec31, %polly.preload.cond29 + %polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ] + ret void + +polly.preload.exec31: ; preds = %polly.preload.cond29 + %1 = sext i32 %polly.access.kspec.load to i64 + %2 = mul nsw i64 7, %1 + %3 = add nsw i64 0, %2 + %4 = add nsw i64 %3, 48 + %polly.access.__data_radiation_MOD_cobi = getelementptr double, ptr @__data_radiation_MOD_cobi, i64 %4 + %polly.access.__data_radiation_MOD_cobi.load = load double, ptr %polly.access.__data_radiation_MOD_cobi, align 8 + br label %polly.preload.merge30 +} + +attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll b/polly/test/GPGPU/intrinsic-copied-into-kernel.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/intrinsic-copied-into-kernel.ll @@ -0,0 +1,76 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR +; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR + +; Test that we do recognise and codegen a kernel that has intrinsics. + +; REQUIRES: pollyacc + +; Check that we model the kernel as a scop. 
+; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end + +; Check that the intrinsic call is present in the kernel IR. +; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_) +; KERNEL-IR: declare float @llvm.sqrt.f32(float) +; KERNEL-IR: declare float @llvm.fabs.f32(float) + + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) + + +; void f(float *A, float *B, int N) { +; for(int i = 0; i < N; i++) { +; float tmp0 = A[i]; +; float tmp1 = sqrt(tmp0); +; float tmp2 = fabs(tmp1); +; float tmp3 = copysignf(tmp1, tmp2); +; B[i] = tmp3; +; } +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(float* %A, float* %B, i32 %N) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %cmp1 = icmp sgt i32 %N, 0 + br i1 %cmp1, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv + %A.arr.i.val = load float, float* %A.arr.i, align 4 + ; Call to intrinsics that should be part of the kernel. + %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val) + %fabs = tail call float @llvm.fabs.f32(float %sqrt); + %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs); + %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv + store float %copysign, float* %B.arr.i, align 4 + + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %N to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.sqrt.f32(float) #0 +declare float @llvm.fabs.f32(float) #0 +declare float @llvm.copysign.f32(float, float) #0 + +attributes #0 = { nounwind readnone } + diff --git a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll b/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll @@ -0,0 +1,47 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \ +; RUN: -disable-output < %s + +; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually +; fail on an illegal module.
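+; As in invalid-kernel.ll further below, the kernel body here uses the host +; pointer value (long)&B[i] as plain data, which PPCG code generation does not +; support yet; the module therefore fails verification and, with the flag above, +; the whole run is expected to abort (hence the XFAIL below).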
+ +; REQUIRES: pollyacc, asserts +; XFAIL: * +; +; void foo(long A[1024], long B[1024]) { +; for (long i = 0; i < 1024; i++) +; A[i] += (B[i] + (long)&B[i]); +; } + + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(ptr %A, ptr %B) { +bb: + br label %bb1 + +bb1: ; preds = %bb10, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb12 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0 + %tmp3 = load i64, ptr %tmp, align 8 + %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0 + %tmp5 = ptrtoint ptr %tmp4 to i64 + %tmp6 = add nsw i64 %tmp3, %tmp5 + %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0 + %tmp8 = load i64, ptr %tmp7, align 8 + %tmp9 = add nsw i64 %tmp8, %tmp6 + store i64 %tmp9, ptr %tmp7, align 8 + br label %bb10 + +bb10: ; preds = %bb2 + %tmp11 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb12: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/invalid-kernel.ll b/polly/test/GPGPU/invalid-kernel.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invalid-kernel.ll @@ -0,0 +1,73 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: not FileCheck %s -check-prefix=KERNEL-IR + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR + +; REQUIRES: pollyacc +; +; void foo(long A[1024], long B[1024]) { +; for (long i = 0; i < 1024; i++) +; A[i] += (B[i] + (long)&B[i]); +; } + +; This kernel loads/stores a pointer address we model. This is a rare case, +; where we still lack proper code-generation support. We check here that we +; detect the invalid IR and bail out gracefully.
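+; The offending value is %tmp5 = ptrtoint ptr %tmp4 to i64 below: the address +; of B[i] is consumed as data inside the would-be kernel. No kernel IR is +; expected (the KERNEL-IR run is inverted with 'not'), and the host code keeps +; executing the original loop, as the 'br i1 false' IR check shows.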
+ +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_B, dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost)); + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb2(32 * b0 + t0); + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR + +; KERNEL-IR: kernel + +; IR: br i1 false, label %polly.start, label %bb1 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(ptr %A, ptr %B) { +bb: + br label %bb1 + +bb1: ; preds = %bb10, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb12 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0 + %tmp3 = load i64, ptr %tmp, align 8 + %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0 + %tmp5 = ptrtoint ptr %tmp4 to i64 + %tmp6 = add nsw i64 %tmp3, %tmp5 + %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0 + %tmp8 = load i64, ptr %tmp7, align 8 + %tmp9 = add nsw i64 %tmp8, %tmp6 + store i64 %tmp9, ptr %tmp7, align 8 + br label %bb10 + +bb10: ; preds = %bb2 + %tmp11 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb12: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/invariant-load-array-access.ll b/polly/test/GPGPU/invariant-load-array-access.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invariant-load-array-access.ll @@ -0,0 +1,70 @@ +; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-codegen-ppcg \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + + +; REQUIRES: pollyacc + +; Check that we detect a scop. +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] }; +; SCOP-NEXT: Execution Context: [tmp] -> { : } +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] }; +; SCOP-NEXT: Execution Context: [tmp] -> { : tmp >= 4 } +; SCOP-NEXT: } + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) + +; This test makes sure that such an access pattern is handled correctly +; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads` +; was the main reason that caused this test case to crash. 
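+; Both *control and *readarr are invariant in the loop (see the Invariant +; Accesses listed above); with -polly-invariant-load-hoisting they are preloaded +; on the host rather than re-executed inside the kernel.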
+; +; void f(int *arr, const int *control, const int *readarr) { +; for(int i = 0; i < 1000; i++) { +; int t = 0; +; if (*control > 3) { +; t += *readarr; +; } +; arr[i] = t; +; } +; } + + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.12.0" +define void @f(ptr %arr, ptr %control, ptr %readarr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %if.end + %i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ] + %tmp = load i32, ptr %control, align 4 + %cmp1 = icmp sgt i32 %tmp, 3 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %tmp1 = load i32, ptr %readarr, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.01 + store i32 %t.0, ptr %arrayidx, align 4 + %inc = add nuw nsw i32 %i.01, 1 + %exitcond = icmp eq i32 %inc, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %if.end + ret void +} diff --git a/polly/test/GPGPU/invariant-load-escaping-values.ll b/polly/test/GPGPU/invariant-load-escaping-values.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invariant-load-escaping-values.ll @@ -0,0 +1,30 @@ +; RUN: opt %loadPolly -S -polly-codegen-ppcg \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s + +; REQUIRES: pollyacc + +; CHECK: store i64 %polly.access.B.load, ptr %invariant.preload.s2a +; CHECK: %invariant.final_reload = load i64, ptr %invariant.preload.s2a + +; Verify that the final reload of an invariant scalar memory access uses the +; same stack slot that into which the invariant memory access was stored +; originally. Earlier, this was broken as we introduce a new stack slot aside +; of the preload stack slot, which remained uninitialized and caused our escaping +; loads to contain garbage. + +define i64 @foo(ptr %A, ptr %B) { +entry: + br label %loop + +loop: + %indvar = phi i64 [0, %entry], [%indvar.next, %loop] + %indvar.next = add nsw i64 %indvar, 1 + %idx = getelementptr float, ptr %A, i64 %indvar + store float 42.0, ptr %idx + %invariant = load i64, ptr %B + %cmp = icmp sle i64 %indvar, 1024 + br i1 %cmp, label %loop, label %exit + +exit: + ret i64 %invariant +} diff --git a/polly/test/GPGPU/invariant-load-hoisting-of-array.ll b/polly/test/GPGPU/invariant-load-hoisting-of-array.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invariant-load-hoisting-of-array.ll @@ -0,0 +1,101 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP + +; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; Entry: Contains (%loaded.ptr.preload.s2a = alloca double*) which is +; | invariant load hoisted `%loaded.ptr` +; v +; Run-time check --(failure branch)--> { old code - contains `%loaded.ptr` } +; | +; (success branch) +; | +; v +; New Code: Should refer to `%loaded.ptr.preload.s2a`, which is +; the invariant load hoisted value, NOT `%loaded.ptr`. + +; In Polly, we preserve the old code and create a separate branch that executes +; the GPU code if a run-time check succeeds. + +; We need to make sure that in the new branch, we pick up invariant load hoisted +; values. The old values will belong to the old code branch. 
+ +; In this case, we used to try to load the 'original' %loaded.ptr in the +; 'New Code' branch, which is wrong. Check that this does not happen. + +; Check that we have a Scop with an invariant load of the array. +; SCOP: Function: f +; SCOP-NEXT: Region: %arrload---%for.exit +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: { Stmt_arrload[] -> MemRef_arr_of_ptrs[0] }; + + + +; Check that we have the preloaded array. +; HOST-IR: entry: +; HOST-IR-NEXT: %loaded.ptr.preload.s2a = alloca double* + +; Check that we store the correct value in the preload. +; polly.preload.begin: ; preds = %polly.split_new_and_old +; HOST-IR: %polly.access.arr.of.ptrs = getelementptr double*, double** %arr.of.ptrs, i64 0 +; HOST-IR-NEXT: %polly.access.arr.of.ptrs.load = load double*, double** %polly.access.arr.of.ptrs +; HOST-IR-NEXT: store double* %polly.access.arr.of.ptrs.load, double** %loaded.ptr.preload.s2a + +; Check that we get back data from the kernel. +; HOST-IR: polly.acc.initialize: ; preds = %polly.start +; HOST-IR: [[FIRSTINDEX:%.+]] = getelementptr double, double* %polly.access.arr.of.ptrs.load, i64 1 +; HOST-IR: [[BITCASTED:%.+]] = bitcast double* [[FIRSTINDEX]] to i8* +; HOST-IR: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_loaded_ptr, i8* [[BITCASTED]], i64 800) + +; Check that the kernel launch is generated in the host IR. +; This declaration would not have been generated unless a kernel launch exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) + + +; C pseudocode equivalent +; void f(double **arr_of_ptrs) { +; double *loaded_ptr = arr_of_ptrs[0]; +; if (false) { return; } +; else { +; for(int i = 1; i < 100; i++) { +; loaded_ptr[i] = 42.0; +; } +; } +; } + + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + + +; Function Attrs: nounwind uwtable +define void @f(double **%arr.of.ptrs) #0 { +entry: + br label %arrload + +arrload: ; preds = %"7" + %loaded.ptr = load double*, double** %arr.of.ptrs, align 8 + br i1 false, label %"for.exit", label %"for.preheader" + +"for.preheader": ; preds = %"51" + br label %"for.body" + +"for.body": ; preds = %"53", %"53.lr.ph" + %indvar = phi i64 [ 1, %"for.preheader" ], [ %indvar.next, %"for.body" ] + %slot = getelementptr double, double* %loaded.ptr, i64 %indvar + store double 42.0, double* %slot, align 8 + + %indvar.next = add nuw nsw i64 %indvar, 1 + + %check = icmp sgt i64 %indvar.next, 100 + br i1 %check, label %"for.exit", label %"for.body" + +"for.exit": ; preds = %"52.54_crit_edge", %"51" + ret void +} + +attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll b/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll @@ -0,0 +1,47 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ +; RUN: -S < %s | \ +; RUN: FileCheck -check-prefix=HOST-IR %s + +; RUN: opt %loadPolly -disable-output -polly-acc-dump-kernel-ir \ +; RUN: -polly-codegen-ppcg -polly-scops \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck -check-prefix=KERNEL-IR %s + +; REQUIRES: pollyacc + +; Verify that invariant loads used in a kernel statement are correctly forwarded +;
as a subtree value to the GPU kernel. + +; HOST-IR: store float %polly.access.p.load, ptr %invariant.preload.s2a, align 4 + +; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2({{.*}}ptr addrspace(1) %MemRef_indvar2f__phi{{.*}}) +; KERNEL-IR: %indvar2f.phiops.reload = load float, ptr %indvar2f.phiops, align 4 +; KERNEL-IR: store float %indvar2f.phiops.reload, ptr addrspace(1) %polly.access.MemRef_A, align 4 + +; FIXME: store float %indvar2f.phiops.reload, ptr %indvar2f.phiops, align 4 +; For some reason the above instruction is emitted, which stores back to the address it was just loaded from. + +define void @foo(ptr %A, ptr %p) { +entry: + br label %loop + +loop: + %indvar = phi i64 [0, %entry], [%indvar.next, %loop] + %indvar.next = add i64 %indvar, 1 + %invariant = load float, ptr %p + %ptr = getelementptr float, ptr %A, i64 %indvar + store float 42.0, ptr %ptr + %cmp = icmp sle i64 %indvar, 1024 + br i1 %cmp, label %loop, label %anotherloop + +anotherloop: + %indvar2 = phi i64 [0, %loop], [%indvar2.next, %anotherloop] + %indvar2f = phi float [%invariant, %loop], [%indvar2f, %anotherloop] + %indvar2.next = add i64 %indvar2, 1 + store float %indvar2f, ptr %A + %cmp2 = icmp sle i64 %indvar2, 1024 + br i1 %cmp2, label %anotherloop, label %end + +end: + ret void + +} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll @@ -0,0 +1,62 @@ +; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP + + +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_entry_split[] -> MemRef_begin[0] }; +; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_for_body[i0] -> MemRef_end[0] }; +; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } +; SCOP-NEXT: } + + +; Check that the kernel launch is generated in the host IR. +; This declaration would not have been generated unless a kernel launch exists.
+; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) + +; void f(int *begin, int *end, int *arr) { +; for (int i = *begin; i < *end; i++) { +; arr[i] = 0; +; } +; } +; + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" + +define void @f(ptr %begin, ptr %end, ptr %arr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %tmp1 = load i32, ptr %begin, align 4 + %tmp41 = load i32, ptr %end, align 4 + %cmp2 = icmp slt i32 %tmp1, %tmp41 + br i1 %cmp2, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.03 = phi i32 [ %tmp1, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.03 + store i32 0, ptr %arrayidx, align 4 + %inc = add nsw i32 %i.03, 1 + %tmp4 = load i32, ptr %end, align 4 + %cmp = icmp slt i32 %inc, %tmp4 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll @@ -0,0 +1,56 @@ +; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP + + +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; Check that we detect a scop with invariant accesses. +; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [beginval] -> { Stmt_entry_split[] -> MemRef_begin[0] }; +; SCOP-NEXT: Execution Context: [beginval] -> { : } +; SCOP-NEXT: } + +; Check that the kernel launch is generated in the host IR. +; This declaration would not have been generated unless a kernel launch exists. 
+; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) + +; +; void f(int *begin, int *arr) { +; for (int i = *begin; i < 100; i++) { +; arr[i] = 0; +; } +; } + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" + +define void @f(ptr %begin, ptr %arr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %beginval = load i32, ptr %begin, align 4 + %cmp1 = icmp slt i32 %beginval, 100 + br i1 %cmp1, label %for.body, label %for.end + + + +for.body: ; preds = %for.body.lr.ph, %for.body + %ival = phi i32 [ %beginval, %entry.split ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %ival + store i32 0, ptr %arrayidx, align 4 + %inc = add nsw i32 %ival, 1 + %cmp = icmp slt i32 %ival, 99 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll @@ -0,0 +1,57 @@ +; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; Check that we detect a scop with invariant accesses. +; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp2] -> { Stmt_for_body[i0] -> MemRef_idx[0] }; +; SCOP-NEXT: Execution Context: [tmp2] -> { : } +; SCOP-NEXT: } + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) + +; Check if we generate GPU code for simple loop with variable upper bound. +; This always worked, but have this test to prevent regressions. 
+; void f(int *idx, int *arr) { +; for (int i = 0; i < *idx; i++) { +; arr[i] = 0; +; } +; } +; +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(ptr %idx, ptr %arr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %tmp21 = load i32, ptr %idx, align 4 + %cmp2 = icmp sgt i32 %tmp21, 0 + br i1 %cmp2, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv + store i32 0, ptr %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %tmp2 = load i32, ptr %idx, align 4 + %0 = sext i32 %tmp2 to i64 + %cmp = icmp slt i64 %indvars.iv.next, %0 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} diff --git a/polly/test/GPGPU/invariant-load-hoisting.ll b/polly/test/GPGPU/invariant-load-hoisting.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invariant-load-hoisting.ll @@ -0,0 +1,116 @@ +; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP +; +; RUN: opt %loadPolly -polly-scops -S -polly-invariant-load-hoisting \ +; RUN: -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR +; +; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=KERNEL-IR +; +; REQUIRES: pollyacc +; +; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end26 +; SCOP-NEXT: Max Loop Depth: 3 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [n, tmp12] -> { Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] }; +; SCOP-NEXT: Execution Context: [n, tmp12] -> { : n > 0 } +; SCOP-NEXT: } +; HOST-IR: call void @polly_launchKernel(ptr %[[REGC:[0-9]+]], i32 %{{[0-9]+}}, i32 1, i32 32, i32 1, i32 1, ptr %polly_launch_0_params_i8ptr) +; HOST-IR-NEXT: call void @polly_freeKernel(ptr %[[REGC]]) + +; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_B, ptr addrspace(1) %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge) + + +; Check that we generate correct GPU code in case of invariant load hoisting. 
+; +; +; static const int N = 3000; +; +; void f(int A[N][N], int *invariant, int B[N][N], int n) { +; for (int i = 0; i < n; i++) { +; for (int j = 0; j < n; j++) { +; for (int k = 0; k < n; k++) { +; +; A[*invariant][k] = B[k][k]; +; A[k][*invariant] += B[k][k]; +; } +; } +; } +; } +; + +define void @f(ptr %A, ptr %invariant, ptr %B, i32 %n) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.cond1.preheader.lr.ph, label %for.end26 + +for.cond1.preheader.lr.ph: ; preds = %entry.split + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc24 + %i.07 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc25, %for.inc24 ] + %cmp23 = icmp sgt i32 %n, 0 + br i1 %cmp23, label %for.cond4.preheader.lr.ph, label %for.inc24 + +for.cond4.preheader.lr.ph: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.lr.ph, %for.inc21 + %j.04 = phi i32 [ 0, %for.cond4.preheader.lr.ph ], [ %inc22, %for.inc21 ] + %cmp51 = icmp sgt i32 %n, 0 + br i1 %cmp51, label %for.body6.lr.ph, label %for.inc21 + +for.body6.lr.ph: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.lr.ph, %for.body6 + %k.02 = phi i32 [ 0, %for.body6.lr.ph ], [ %inc, %for.body6 ] + %idxprom = sext i32 %k.02 to i64 + %idxprom7 = sext i32 %k.02 to i64 + %arrayidx8 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom, i64 %idxprom7 + %tmp9 = load i32, ptr %arrayidx8, align 4 + %tmp12 = load i32, ptr %invariant, align 4 + %idxprom9 = sext i32 %tmp12 to i64 + %idxprom11 = sext i32 %k.02 to i64 + %arrayidx12 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom9, i64 %idxprom11 + store i32 %tmp9, ptr %arrayidx12, align 4 + %idxprom13 = sext i32 %k.02 to i64 + %idxprom15 = sext i32 %k.02 to i64 + %arrayidx16 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom13, i64 %idxprom15 + %tmp17 = load i32, ptr %arrayidx16, align 4 + %idxprom17 = sext i32 %k.02 to i64 + %tmp21 = load i32, ptr %invariant, align 4 + %idxprom19 = sext i32 %tmp21 to i64 + %arrayidx20 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom17, i64 %idxprom19 + %tmp22 = load i32, ptr %arrayidx20, align 4 + %add = add nsw i32 %tmp22, %tmp17 + store i32 %add, ptr %arrayidx20, align 4 + %inc = add nuw nsw i32 %k.02, 1 + %cmp5 = icmp slt i32 %inc, %n + br i1 %cmp5, label %for.body6, label %for.cond4.for.inc21_crit_edge + +for.cond4.for.inc21_crit_edge: ; preds = %for.body6 + br label %for.inc21 + +for.inc21: ; preds = %for.cond4.for.inc21_crit_edge, %for.cond4.preheader + %inc22 = add nuw nsw i32 %j.04, 1 + %cmp2 = icmp slt i32 %inc22, %n + br i1 %cmp2, label %for.cond4.preheader, label %for.cond1.for.inc24_crit_edge + +for.cond1.for.inc24_crit_edge: ; preds = %for.inc21 + br label %for.inc24 + +for.inc24: ; preds = %for.cond1.for.inc24_crit_edge, %for.cond1.preheader + %inc25 = add nuw nsw i32 %i.07, 1 + %cmp = icmp slt i32 %inc25, %n + br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end26_crit_edge + +for.cond.for.end26_crit_edge: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.cond.for.end26_crit_edge, %entry.split + ret void +} diff --git a/polly/test/GPGPU/invariant-load-of-scalar.ll b/polly/test/GPGPU/invariant-load-of-scalar.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/invariant-load-of-scalar.ll @@ -0,0 +1,81 @@ +; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops 
-disable-output < %s | FileCheck -check-prefix=SCOP %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ +; RUN: -S < %s | \ +; RUN: FileCheck -check-prefix=HOST-IR %s + + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ +; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ +; RUN: FileCheck -check-prefix=KERNEL-IR %s + +; REQUIRES: pollyacc + +; Check that we offload invariant loads of scalars correctly. + +; Check that invariant loads are present. +; SCOP: Function: checkPrivatization +; SCOP-NEXT: Region: %entry.split---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp, tmp2] -> { Stmt_entry_split[] -> MemRef_begin[0] }; +; SCOP-NEXT: Execution Context: [tmp, tmp2] -> { : } +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp, tmp2] -> { Stmt_for_body[i0] -> MemRef_end[0] }; +; SCOP-NEXT: Execution Context: [tmp, tmp2] -> { : } +; SCOP-NEXT: } +; + +; Check that we do not actually allocate arrays for %begin, %end, since they are +; invariant load hoisted. +; HOST-IR: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice +; HOST-IR-NOT: call ptr @polly_allocateMemoryForDevice + +; Check that we send the invariant loaded scalars as parameters to the +; kernel function. +; KERNEL-IR: define ptx_kernel void @FUNC_checkPrivatization_SCOP_0_KERNEL_0 +; KERNEL-IR-SAME: (ptr addrspace(1) %MemRef_A, i32 %tmp, +; KERNEL-IR-SAME: i32 %tmp2, i32 %polly.access.begin.load) + + +; void checkScalarPointerOffload(int A[], int *begin, int *end) { +; for(int i = *begin; i < *end; i++) { +; A[i] = 10; +; } +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +define void @checkPrivatization(ptr %A, ptr %begin, ptr %end) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %tmp = load i32, ptr %begin, align 4 + %tmp21 = load i32, ptr %end, align 4 + %cmp3 = icmp slt i32 %tmp, %tmp21 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + %tmp1 = sext i32 %tmp to i64 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv4 = phi i64 [ %tmp1, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv4 + store i32 10, ptr %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv4, 1 + %tmp2 = load i32, ptr %end, align 4 + %tmp3 = sext i32 %tmp2 to i64 + %cmp = icmp slt i64 %indvars.iv.next, %tmp3 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} + diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll @@ -0,0 +1,106 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=KERNEL %s + +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ +; RUN: -S < %s | \ +; RUN: FileCheck -check-prefix=IR %s + +; REQUIRES: pollyacc +; +; void kernel_params_only_some_arrays(float A[], float B[]) { +; for (long i = 0; i < 32; i++) +; A[i] += 42; +; +; for (long i = 0; i < 32; i++) +; B[i] += 42; 
+; } + +; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0' +; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0" +; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" + +; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B) +; KERNEL-NEXT: entry: +; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; KERNEL-NEXT: %b0 = zext i32 %0 to i64 +; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; KERNEL-NEXT: %t0 = zext i32 %1 to i64 + +; KERNEL: ret void +; KERNEL-NEXT: } + +; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1' +; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1" +; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" + +; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A) +; KERNEL-NEXT: entry: +; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; KERNEL-NEXT: %b0 = zext i32 %0 to i64 +; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; KERNEL-NEXT: %t0 = zext i32 %1 to i64 + +; KERNEL: ret void +; KERNEL-NEXT: } + + +; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) +; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 +; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* +; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] + +; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) +; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 +; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 +; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* +; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @kernel_params_only_some_arrays(float* %A, float* %B) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] + %exitcond1 = icmp ne i64 %i.0, 32 + br i1 %exitcond1, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds float, float* %A, i64 %i.0 + %tmp = load float, float* %arrayidx, align 4 + %add = fadd float %tmp, 4.200000e+01 + store float %add, float* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nuw nsw i64 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + br label %for.cond2 + +for.cond2: ; preds = %for.inc7, %for.end + %i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ] + %exitcond = icmp ne i64 %i1.0, 32 + br i1 %exitcond, label %for.body4, label %for.end9 + +for.body4: ; preds = %for.cond2 + %arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0 + %tmp2 = load float, float* %arrayidx5, align 4 + %add6 = fadd float %tmp2, 4.200000e+01 + store float %add6, float* %arrayidx5, align 4 + br label %for.inc7 + +for.inc7: ; preds = %for.body4 + %inc8 = add nuw nsw 
i64 %i1.0, 1 + br label %for.cond2 + +for.end9: ; preds = %for.cond2 + ret void +} diff --git a/polly/test/GPGPU/kernel-params-scop-parameter.ll b/polly/test/GPGPU/kernel-params-scop-parameter.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/kernel-params-scop-parameter.ll @@ -0,0 +1,38 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=KERNEL-IR %s + +; REQUIRES: pollyacc + +; void kernel_params_scop_parameter(float A[], long n) { +; for (long i = 0; i < n; i++) +; A[i] += 42; +; } + +; KERNEL-IR: define ptx_kernel void @FUNC_kernel_params_scop_parameter_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @kernel_params_scop_parameter(ptr %A, i64 %n) { +bb: + br label %bb1 + +bb1: ; preds = %bb6, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] + %tmp = icmp slt i64 %i.0, %n + br i1 %tmp, label %bb2, label %bb8 + +bb2: ; preds = %bb1 + %tmp3 = getelementptr inbounds float, ptr %A, i64 %i.0 + %tmp4 = load float, ptr %tmp3, align 4 + %tmp5 = fadd float %tmp4, 4.200000e+01 + store float %tmp5, ptr %tmp3, align 4 + br label %bb6 + +bb6: ; preds = %bb2 + %tmp7 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb8: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll b/polly/test/GPGPU/kernels-names-across-scops-funcs.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/kernels-names-across-scops-funcs.ll @@ -0,0 +1,124 @@ +; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen-ppcg \ +; RUN: -polly-acc-dump-kernel-ir -disable-output < %s | \ +; RUN: FileCheck -check-prefix=KERNEL %s + +; REQUIRES: pollyacc + +; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { +; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_1_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { +; KERNEL: define ptx_kernel void @FUNC_foo2_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Function Attrs: nounwind uwtable +define void @foo(i32 %arg, ptr %arg1) #0 { +bb: + br label %bb2 + +bb2: ; preds = %bb + %tmp = icmp sgt i32 %arg, 0 + br i1 %tmp, label %bb3, label %bb13 + +bb3: ; preds = %bb2 + br label %bb4 + +bb4: ; preds = %bb4, %bb3 + %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ] + %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp5 + %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2 + %tmp8 = add nsw i32 %tmp7, 1 + store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2 + %tmp9 = add nuw nsw i64 %tmp5, 1 + %tmp10 = zext i32 %arg to i64 + %tmp11 = icmp ne i64 %tmp9, %tmp10 + br i1 %tmp11, label %bb4, label %bb12 + +bb12: ; preds = %bb4 + br label %bb13 + +bb13: ; preds = %bb12, %bb2 + %tmp14 = tail call i64 @clock() #3 + %tmp15 = icmp eq i64 %tmp14, 0 + br i1 %tmp15, label %bb16, label %bb29 + +bb16: ; preds = %bb13 + %tmp17 = icmp sgt i32 %arg, 0 + br i1 %tmp17, label %bb18, label %bb28 + +bb18: ; preds = %bb16 + br label %bb19 + +bb19: ; preds = %bb19, %bb18 + %tmp20 = phi i64 [ 0, %bb18 ], [ %tmp24, %bb19 ] + %tmp21 = getelementptr inbounds i32, ptr %arg1, i64 %tmp20 + %tmp22 = load i32, ptr %tmp21, align 4, !tbaa !2 + %tmp23 = add nsw i32 %tmp22, 1 + store i32 %tmp23, ptr %tmp21, align 4, !tbaa !2 + %tmp24 = add nuw nsw i64 %tmp20, 1 + %tmp25 = zext i32 %arg to i64 + %tmp26 = icmp ne i64 %tmp24, %tmp25 + br i1 %tmp26, label %bb19, label %bb27 + +bb27: ; preds = 
%bb19 + br label %bb28 + +bb28: ; preds = %bb27, %bb16 + br label %bb29 + +bb29: ; preds = %bb28, %bb13 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1 + +; Function Attrs: nounwind +declare i64 @clock() #2 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define void @foo2(i32 %arg, ptr %arg1) #0 { +bb: + br label %bb2 + +bb2: ; preds = %bb + %tmp = icmp sgt i32 %arg, 0 + br i1 %tmp, label %bb3, label %bb13 + +bb3: ; preds = %bb2 + br label %bb4 + +bb4: ; preds = %bb4, %bb3 + %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ] + %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp5 + %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2 + %tmp8 = add nsw i32 %tmp7, 1 + store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2 + %tmp9 = add nuw nsw i64 %tmp5, 1 + %tmp10 = zext i32 %arg to i64 + %tmp11 = icmp ne i64 %tmp9, %tmp10 + br i1 %tmp11, label %bb4, label %bb12 + +bb12: ; preds = %bb4 + br label %bb13 + +bb13: ; preds = %bb12, %bb2 + ret void +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll b/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll @@ -0,0 +1,89 @@ +; RUN: opt %loadPolly -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s \ +; RUN: -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll \ +; RUN: | FileCheck %s --check-prefix=HOST-IR + +; Test that we do recognise and codegen a kernel that has functions that can +; be mapped to NVIDIA's libdevice + +; REQUIRES: pollyacc + +; Check that we model the kernel as a scop. +; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end + +; Check that the intrinsic call is present in the kernel IR. 
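Editorial note on the checks that follow: this is a minimal, stand-alone C sketch of the renaming convention the KERNEL-IR lines test, assuming only what the checks themselves show (expf/cosf/logf becoming __nv_expf/__nv_cosf/__nv_logf, and llvm.powi surfacing as __nv_powif). Polly performs the substitution on LLVM IR against the file passed via -polly-acc-libdevice; the program below is illustrative only.

#include <stdio.h>

/* Illustrative only: a plain prefix rule covers the libm-style names shown
 * in the checks; the powi intrinsic is matched specially and is not handled
 * by this sketch. */
int main(void) {
  const char *fns[] = { "expf", "cosf", "logf" };
  char buf[64];
  for (int i = 0; i < 3; i++) {
    snprintf(buf, sizeof buf, "__nv_%s", fns[i]);
    printf("%s -> %s\n", fns[i], buf);   /* e.g. expf -> __nv_expf */
  }
  return 0;
}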
+; KERNEL-IR: %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_) +; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf) +; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf) + +; Powi and exp cannot be lowered directly. Rather, we expect them to be +; lowered by libdevice. +; KERNEL-IR: %p_powi = tail call float @__nv_powif(float %p_logf, i32 2) +; KERNEL-IR: %p_exp = tail call float @__nv_expf(float %p_powi) + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) + + +; void f(float *A, float *B, int N) { +; for(int i = 0; i < N; i++) { +; float tmp0 = A[i]; +; float expf = expf(tmp1); +; cosf = cosf(expf); +; logf = logf(cosf); +; powi = powi(logf, 2); +; exp = exp(powi); +; B[i] = logf; +; } +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(ptr %A, ptr %B, i32 %N) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %cmp1 = icmp sgt i32 %N, 0 + br i1 %cmp1, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %A.arr.i = getelementptr inbounds float, ptr %A, i64 %indvars.iv + %A.arr.i.val = load float, ptr %A.arr.i, align 4 + ; Call to intrinsics that should be part of the kernel. + %expf = tail call float @expf(float %A.arr.i.val) + %cosf = tail call float @cosf(float %expf) + %logf = tail call float @logf(float %cosf) + %powi = tail call float @llvm.powi.f32.i32(float %logf, i32 2) + %exp = tail call float @llvm.exp.f32(float %powi) + %B.arr.i = getelementptr inbounds float, ptr %B, i64 %indvars.iv + store float %exp, ptr %B.arr.i, align 4 + + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %N to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} + +; Function Attrs: nounwind readnone +declare float @expf(float) #0 +declare float @cosf(float) #0 +declare float @logf(float) #0 +declare float @llvm.powi.f32.i32(float, i32) #0 +declare float @llvm.exp.f32(float) #0 + +attributes #0 = { nounwind readnone } + diff --git a/polly/test/GPGPU/live-range-reordering-with-privatization.ll b/polly/test/GPGPU/live-range-reordering-with-privatization.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/live-range-reordering-with-privatization.ll @@ -0,0 +1,78 @@ + ; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ +; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ +; RUN: -polly-acc-dump-code -disable-output \ +; RUN: < %s | FileCheck %s -check-prefix=CODE + +; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ +; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ +; RUN: -polly-acc-dump-kernel-ir -disable-output \ +; RUN: < %s | FileCheck %s -check-prefix=KERNELIR + +; REQUIRES: pollyacc + +; void f(const int *end, int *arr, const int *control, const int *readarr) { +; for (int i = 0; i < *end; i++) { +; int t = 0; +; if (*control > 3) { +; t += readarr[i]; +; } +; arr[i] = t; +; } +; } + +; This test case tests the ability to infer that `t` is local to each loop +; iteration, and can 
therefore be privatized. + +; CODE: # kernel0 +; CODE-NEXT: for (int c0 = 0; c0 <= (tmp - 32 * b0 - 1) / 1048576; c0 += 1) +; CODE-NEXT: if (tmp >= 32 * b0 + t0 + 1048576 * c0 + 1) { +; CODE-NEXT: Stmt_for_body_last(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: if (tmp1 >= 4) +; CODE-NEXT: Stmt_if_then(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: Stmt_if_end(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: } + +; KERNELIR: %private_array = alloca i32 + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.12.0" + +define void @f(ptr %end, ptr %arr, ptr %control, ptr %readarr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %tmp3 = load i32, ptr %end, align 4 + %cmp4 = icmp sgt i32 %tmp3, 0 + br i1 %cmp4, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %if.end + %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ] + %tmp1 = load i32, ptr %control, align 4 + %cmp1 = icmp sgt i32 %tmp1, 3 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, ptr %readarr, i32 %i.05 + %tmp2 = load i32, ptr %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %t.0 = phi i32 [ %tmp2, %if.then ], [ 0, %for.body ] + %arrayidx2 = getelementptr inbounds i32, ptr %arr, i32 %i.05 + store i32 %t.0, ptr %arrayidx2, align 4 + %inc = add nuw nsw i32 %i.05, 1 + %tmp = load i32, ptr %end, align 4 + %cmp = icmp slt i32 %inc, %tmp + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %if.end + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} + diff --git a/polly/test/GPGPU/loops-outside-scop.ll b/polly/test/GPGPU/loops-outside-scop.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/loops-outside-scop.ll @@ -0,0 +1,67 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP + +; There is no FileCheck because we want to make sure that this doesn't crash. +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \ +; RUN: -disable-output < %s + +; REQUIRES: pollyacc + +; Due to the existence of the `fence` call, We can only detect the inner loop +; and not the outer loop. PPCGCodeGeneration had not implemented this case. +; The fix was to pull the implementation from `IslNodeBuilder. 
+ +; Make sure that we only capture the inner loop +; SCOP: Function: f +; SCOP-NEXT: Region: %for2.body---%for2.body.fence +; SCOP-NEXT: Max Loop Depth: 1 + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +declare void @fn_to_fence(ptr %val) + +; void f(int *arr, bool shouldcont) { +; for(int i = 0; ; i++) { +; for(int j = 0; j < 10; j++) { +; arr[j] = i; +; } +; fence(arr); +; if (!shouldcont) break; +; } +; } + + +; Function Attrs: nounwind uwtable +define void @f(ptr %arr, i1 %shouldcont) #1 { +entry: + br label %for.init + +for.init: ; preds = %for.end, %entry.split + %i = phi i32 [ %i.next, %for.end ], [ 0, %entry ] + br label %for2.body + +for2.body: ; preds = %"65", %"64" + %j = phi i32 [ %j.next, %for2.body ], [ 0, %for.init ] + %j.sext = sext i32 %j to i64 + %arr.slot = getelementptr i32, ptr %arr, i64 %j.sext + store i32 %i, ptr %arr.slot, align 4 + %exitcond = icmp eq i32 %j, 10 + %j.next = add i32 %j, 1 + br i1 %exitcond, label %for2.body.fence, label %for2.body + +for2.body.fence: ; preds = %"65" + call void @fn_to_fence(ptr %arr) #2 + br i1 %shouldcont, label %for.end, label %exit +for.end: ; preds = %"69" + %i.next = add i32 %i, 1 + br label %for.init + +exit: ; preds = %"69" + ret void + +} + + +attributes #0 = { argmemonly nounwind } +attributes #1 = { nounwind uwtable } +attributes #2 = { nounwind } diff --git a/polly/test/GPGPU/managed-memory-rewrite-alloca.ll b/polly/test/GPGPU/managed-memory-rewrite-alloca.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/managed-memory-rewrite-alloca.ll @@ -0,0 +1,60 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ +; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ +; RUN: -polly-acc-rewrite-managed-memory -polly-acc-rewrite-allocas < %s | FileCheck %s --check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP: i32 MemRef_arr[*]; + +; Check that we generate a constructor call for @A.toptr +; HOST-IR-NOT: %arr = alloca [100 x i32] + +source_filename = "test.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + + +define void @f() { +entry: + %arr = alloca [100 x i32] + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [100 x i32], ptr %arr, i64 0, i64 %indvars.iv1 + store i32 42, ptr %arrayidx, align 4, !tbaa !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 
= !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll @@ -0,0 +1,93 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -polly-codegen-ppcg \ +; RUN: -S -polly-acc-codegen-managed-memory \ +; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR +; +; REQUIRES: pollyacc +; +; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and +; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory` +; pass, even inside `constantExpr`. This is necessary because a cookie cutter +; Inst->replaceUsesOfWith(...) call does not actually work, because this does +; not replace the instruction within a ConstantExpr. +; +; #include +; +; static const int N = 100; +; int* f(int *ToFree) { +; free(ToFree); +; int *A = (int *)malloc(sizeof(int) * N); +; for(int i = 0; i < N; i++) { +; A[i] = 42; +; } +; return A; +; +; } + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 + +; SCOP: Arrays { +; SCOP-NEXT: i32 MemRef_tmp[*]; // Element size 4 +; SCOP-NEXT: } + +; // Check that polly_mallocManaged is declared and used correctly. +; HOST-IR: declare ptr @polly_mallocManaged(i64) + +; // Check that polly_freeManaged is declared and used correctly. +; HOST-IR call void @polly_freeManaged(i8* %toFree) +; HOST-IR: declare void @polly_freeManaged(ptr) + +; // Check that we remove the original malloc,free +; HOST-IR-NOT: declare ptr @malloc(i64) +; HOST-IR-NOT: declare void @free(ptr) + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +define ptr @f(ptr %toFree) { +entry: + ; Free inside bitcast + call void @free (ptr %toFree) + br label %entry.split + +entry.split: ; preds = %entry + ; malloc inside bitcast. 
+ %tmp = call ptr @malloc (i64 400) + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %tmp, i64 %indvars.iv1 + store i32 42, ptr %arrayidx, align 4, !tbaa !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret ptr %tmp +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 + +declare ptr @malloc(i64) +declare void @free(ptr) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll @@ -0,0 +1,91 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -polly-codegen-ppcg \ +; RUN: -S -polly-acc-codegen-managed-memory \ +; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR +; +; REQUIRES: pollyacc +; +; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and +; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory` +; pass. +; +; #include +; +; static const int N = 100; +; int* f(int *ToFree) { +; free(ToFree); +; int *A = (int *)malloc(sizeof(int) * N); +; for(int i = 0; i < N; i++) { +; A[i] = 42; +; } +; return A; +; +; } + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 + +; SCOP: Arrays { +; SCOP-NEXT: i32 MemRef_call[*]; // Element size 4 +; SCOP-NEXT: } + +; // Check that polly_mallocManaged is declared and used correctly. +; HOST-IR: %call = tail call ptr @polly_mallocManaged(i64 400) +; HOST-IR: declare ptr @polly_mallocManaged(i64) + +; // Check that polly_freeManaged is declared and used correctly. 
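Editorial note: a hedged C-level picture of the rewrite exercised here and in the preceding constantExpr variant. The prototypes are inferred from the HOST-IR declares (ptr(i64) and void(ptr)), treating the i64 as size_t; whether the runtime backs these calls with CUDA managed memory is an assumption about Polly's GPU runtime library, not something this test shows. The HOST-IR lines that follow pin down the free side of the rewrite.

#include <stddef.h>

/* Assumed prototypes, inferred from the declares checked in this test. */
void *polly_mallocManaged(size_t size);
void polly_freeManaged(void *ptr);

/* Before the -polly-acc-rewrite-managed-memory pass:
 *   int *A = malloc(sizeof(int) * 100);   free(ToFree);
 * After the pass, the SCoP's allocations become accessible from the GPU: */
int *alloc_after_rewrite(void) {
  return (int *)polly_mallocManaged(sizeof(int) * 100);
}

void free_after_rewrite(int *ToFree) {
  polly_freeManaged(ToFree);
}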
+; HOST-IR %toFreeBitcast = bitcast i32* %toFree to i8* +; HOST-IR call void @polly_freeManaged(i8* %toFreeBitcast) +; HOST-IR: declare void @polly_freeManaged(ptr) + +; // Check that we remove the original malloc,free +; HOST-IR-NOT: declare ptr @malloc(i64) +; HOST-IR-NOT: declare void @free(ptr) + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +define ptr @f(ptr %toFree) { +entry: + call void @free(ptr %toFree) + br label %entry.split + +entry.split: ; preds = %entry + %call = tail call ptr @malloc(i64 400) + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %call, i64 %indvars.iv1 + store i32 42, ptr %arrayidx, align 4, !tbaa !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret ptr %call +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 + +declare ptr @malloc(i64) +declare void @free(ptr) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/memory-only-referenced-from-access.ll b/polly/test/GPGPU/memory-only-referenced-from-access.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/memory-only-referenced-from-access.ll @@ -0,0 +1,44 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -polly-invariant-load-hoisting -polly-ignore-aliasing \ +; RUN: -polly-process-unprofitable -polly-ignore-parameter-bounds \ +; RUN: -polly-acc-fail-on-verify-module-failure \ +; RUN: -polly-acc-codegen-managed-memory \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s + +; REQUIRES: pollyacc + +; Verify that we correctly generate a kernel even if certain invariant load +; hoisted parameters appear only in memory accesses, but not domain elements. 
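Editorial note: an approximate C rendering of @quux below may make the distinction clearer; the parameter names are invented and the global struct from the IR is simplified to a plain pointer. The hoisted load lb never occurs in a loop bound (the iteration domain), only in an array subscript (the access relation), yet it must still be passed to the kernel, which is what the CHECK line verifies.

/* Sketch only, not part of the patch. */
void quux_sketch(int *lower_bound, int *trip_count, double *out) {
  int lb = *lower_bound;    /* invariant load, hoisted                   */
  int n  = *trip_count;     /* invariant load that does bound the domain */
  for (int i = 0; ; i++) {
    out[i - lb] = 0.0;      /* lb appears only in the access subscript   */
    if (i == n)             /* the domain is defined by n alone          */
      break;
  }
}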
+ +; CHECK: @FUNC_quux_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_tmp4, i32 %tmp3, i32 %tmp, i32 %tmp31, i32 %tmp2) + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +%struct.hoge = type { ptr, i64, i64, [1 x %struct.widget] } +%struct.widget = type { i64, i64, i64 } + +@global = external unnamed_addr global %struct.hoge, align 32 + +define void @quux(ptr noalias %arg, ptr noalias %arg1) { +bb: + %tmp = load i32, ptr %arg, align 4 + %tmp2 = sext i32 %tmp to i64 + %tmp3 = load i32, ptr %arg1, align 4 + %tmp4 = load ptr, ptr @global, align 32 + br label %bb5 + +bb5: ; preds = %bb5, %bb + %tmp6 = phi i32 [ %tmp11, %bb5 ], [ 0, %bb ] + %tmp7 = sext i32 %tmp6 to i64 + %tmp8 = sub nsw i64 %tmp7, %tmp2 + %tmp9 = getelementptr [0 x double], ptr %tmp4, i64 0, i64 %tmp8 + store double undef, ptr %tmp9, align 8 + %tmp10 = icmp eq i32 %tmp6, %tmp3 + %tmp11 = add i32 %tmp6, 1 + br i1 %tmp10, label %bb12, label %bb5 + +bb12: ; preds = %bb5 + ret void +} diff --git a/polly/test/GPGPU/mostly-sequential.ll b/polly/test/GPGPU/mostly-sequential.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/mostly-sequential.ll @@ -0,0 +1,105 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; REQUIRES: pollyacc + +; void foo(float A[]) { +; for (long i = 0; i < 128; i++) +; A[i] += i; +; +; for (long i = 0; i < 128; i++) +; for (long j = 0; j < 128; j++) +; A[42] += i + j; +; } + +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(4); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: { +; CODE-NEXT: dim3 k1_dimBlock; +; CODE-NEXT: dim3 k1_dimGrid; +; CODE-NEXT: kernel1 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (128) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb4(32 * b0 + t0); + +; CODE: # kernel1 +; CODE-NEXT: for (int c0 = 0; c0 <= 127; c0 += 1) +; CODE-NEXT: for (int c1 = 0; c1 <= 127; c1 += 1) +; CODE-NEXT: Stmt_bb14(c0, c1); + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(ptr %A) { +bb: + br label %bb3 + +bb3: ; preds = %bb8, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ] + %exitcond2 = icmp ne i64 %i.0, 128 + br i1 %exitcond2, label %bb4, label %bb10 + +bb4: ; preds = %bb3 + %tmp = sitofp i64 %i.0 to float + %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0 + %tmp6 = load float, ptr %tmp5, align 4 + %tmp7 = fadd float %tmp6, %tmp + store float %tmp7, ptr %tmp5, align 4 + br label %bb8 + +bb8: ; preds = %bb4 + %tmp9 = add nuw nsw i64 %i.0, 1 + br label %bb3 + +bb10: ; preds = %bb3 + br label %bb11 + +bb11: ; preds = %bb23, %bb10 + %i1.0 = phi i64 [ 0, %bb10 ], [ %tmp24, %bb23 ] + %exitcond1 = icmp ne i64 %i1.0, 128 + br i1 %exitcond1, label %bb12, label %bb25 + +bb12: ; preds = %bb11 + br label %bb13 + +bb13: ; preds = %bb20, %bb12 + %j.0 = phi i64 [ 0, %bb12 ], [ %tmp21, %bb20 ] + %exitcond = icmp ne i64 %j.0, 128 + br i1 %exitcond, label %bb14, label %bb22 + +bb14: ; preds = %bb13 + %tmp15 = add 
nuw nsw i64 %i1.0, %j.0 + %tmp16 = sitofp i64 %tmp15 to float + %tmp17 = getelementptr inbounds float, ptr %A, i64 42 + %tmp18 = load float, ptr %tmp17, align 4 + %tmp19 = fadd float %tmp18, %tmp16 + store float %tmp19, ptr %tmp17, align 4 + br label %bb20 + +bb20: ; preds = %bb14 + %tmp21 = add nuw nsw i64 %j.0, 1 + br label %bb13 + +bb22: ; preds = %bb13 + br label %bb23 + +bb23: ; preds = %bb22 + %tmp24 = add nuw nsw i64 %i1.0, 1 + br label %bb11 + +bb25: ; preds = %bb11 + ret void +} diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/non-read-only-scalars.ll @@ -0,0 +1,168 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s -check-prefix=KERNEL-IR +; +; REQUIRES: pollyacc +; +; #include +; +; float foo(float A[]) { +; float sum = 0; +; +; for (long i = 0; i < 32; i++) +; A[i] = i; +; +; for (long i = 0; i < 32; i++) +; A[i] += i; +; +; for (long i = 0; i < 32; i++) +; sum += A[i]; +; +; return sum; +; } +; +; int main() { +; float A[32]; +; float sum = foo(A); +; printf("%f\n", sum); +; } + +; CODE: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(1); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: { +; CODE-NEXT: dim3 k1_dimBlock; +; CODE-NEXT: dim3 k1_dimGrid; +; CODE-NEXT: kernel1 <<>> (dev_MemRef_sum_0__phi); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: { +; CODE-NEXT: dim3 k2_dimBlock; +; CODE-NEXT: dim3 k2_dimGrid; +; CODE-NEXT: kernel2 <<>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); +; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0__phi)); +; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: { +; CODE-NEXT: Stmt_bb4(t0); +; CODE-NEXT: Stmt_bb10(t0); +; CODE-NEXT: } + +; CODE: # kernel1 +; CODE-NEXT: Stmt_bb17(); + +; CODE: # kernel2 +; TODO-NEXT: { +; TODO-NEXT: read(); +; TODO-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) { +; TODO-NEXT: Stmt_bb18(c0); +; TODO-NEXT: if (c0 <= 31) +; TODO-NEXT: Stmt_bb20(c0); +; TODO-NEXT: } +; TODO-NEXT: write(); +; TODO-NEXT: } + + +; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(ptr addrspace(1) %MemRef_sum_0__phi) +; KERNEL-IR: store float 0.000000e+00, ptr %sum.0.phiops +; KERNEL-IR: [[REGA:%.+]] = addrspacecast ptr addrspace(1) %MemRef_sum_0__phi to ptr +; KERNEL-IR: [[REGB:%.+]] = load float, ptr %sum.0.phiops +; KERNEL-IR: store float [[REGB]], ptr [[REGA]] + +; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2(ptr addrspace(1) %MemRef_A, ptr addrspace(1) %MemRef_sum_0__phi, ptr addrspace(1) %MemRef_sum_0) + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1 + +define float @foo(ptr %A) { +bb: + br label %bb3 + +bb3: ; preds = %bb6, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] + %exitcond2 = icmp ne i64 %i.0, 32 + br i1 %exitcond2, label %bb4, 
label %bb8 + +bb4: ; preds = %bb3 + %tmp = sitofp i64 %i.0 to float + %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0 + store float %tmp, ptr %tmp5, align 4 + br label %bb6 + +bb6: ; preds = %bb4 + %tmp7 = add nuw nsw i64 %i.0, 1 + br label %bb3 + +bb8: ; preds = %bb3 + br label %bb9 + +bb9: ; preds = %bb15, %bb8 + %i1.0 = phi i64 [ 0, %bb8 ], [ %tmp16, %bb15 ] + %exitcond1 = icmp ne i64 %i1.0, 32 + br i1 %exitcond1, label %bb10, label %bb17 + +bb10: ; preds = %bb9 + %tmp11 = sitofp i64 %i1.0 to float + %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0 + %tmp13 = load float, ptr %tmp12, align 4 + %tmp14 = fadd float %tmp13, %tmp11 + store float %tmp14, ptr %tmp12, align 4 + br label %bb15 + +bb15: ; preds = %bb10 + %tmp16 = add nuw nsw i64 %i1.0, 1 + br label %bb9 + +bb17: ; preds = %bb9 + br label %bb18 + +bb18: ; preds = %bb20, %bb17 + %sum.0 = phi float [ 0.000000e+00, %bb17 ], [ %tmp23, %bb20 ] + %i2.0 = phi i64 [ 0, %bb17 ], [ %tmp24, %bb20 ] + %exitcond = icmp ne i64 %i2.0, 32 + br i1 %exitcond, label %bb19, label %bb25 + +bb19: ; preds = %bb18 + br label %bb20 + +bb20: ; preds = %bb19 + %tmp21 = getelementptr inbounds float, ptr %A, i64 %i2.0 + %tmp22 = load float, ptr %tmp21, align 4 + %tmp23 = fadd float %sum.0, %tmp22 + %tmp24 = add nuw nsw i64 %i2.0, 1 + br label %bb18 + +bb25: ; preds = %bb18 + %sum.0.lcssa = phi float [ %sum.0, %bb18 ] + ret float %sum.0.lcssa +} + +define i32 @main() { +bb: + %A = alloca [32 x float], align 16 + %tmp1 = call float @foo(ptr %A) + %tmp2 = fpext float %tmp1 to double + %tmp3 = call i32 (ptr, ...) @printf(ptr @.str, double %tmp2) #2 + ret i32 0 +} + +declare i32 @printf(ptr, ...) #1 + diff --git a/polly/test/GPGPU/non-zero-array-offset.ll b/polly/test/GPGPU/non-zero-array-offset.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/non-zero-array-offset.ll @@ -0,0 +1,116 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR +; +; REQUIRES: pollyacc + +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice)); + +; CODE: dim3 k0_dimBlock(8); +; CODE-NEXT: dim3 k0_dimGrid(1); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_B); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: { +; CODE-NEXT: dim3 k1_dimBlock(8); +; CODE-NEXT: dim3 k1_dimGrid(1); +; CODE-NEXT: kernel1 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost)); + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb3(t0); + +; CODE: # kernel1 +; CODE-NEXT: Stmt_bb11(t0); + +; IR: %p_dev_array_MemRef_B = call ptr @polly_allocateMemoryForDevice(i64 32) +; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 32) +; IR-NEXT: [[REG0:%.+]] = getelementptr float, ptr %B, i64 8 +; IR-NEXT: call void @polly_copyFromHostToDevice(ptr [[REG0]], ptr %p_dev_array_MemRef_B, i64 32) + +; IR: [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_B) +; IR-NEXT: [[REGC:%.+]] = getelementptr float, ptr [[REGA]], i64 -8 + +; void foo(float A[], float B[]) { +; for (long i = 0; i < 8; i++) +; 
B[i + 8] *= 4; +; +; for (long i = 0; i < 8; i++) +; A[i] *= 12; +; } +; +; #ifdef OUTPUT +; int main() { +; float A[16]; +; +; for (long i = 0; i < 16; i++) { +; __sync_synchronize(); +; A[i] = i; +; } +; +; foo(A, A); +; +; float sum = 0; +; for (long i = 0; i < 16; i++) { +; __sync_synchronize(); +; sum += A[i]; +; } +; +; printf("%f\n", sum); +; } +; #endif +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(ptr %A, ptr %B) { +bb: + br label %bb2 + +bb2: ; preds = %bb7, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ] + %exitcond1 = icmp ne i64 %i.0, 8 + br i1 %exitcond1, label %bb3, label %bb9 + +bb3: ; preds = %bb2 + %tmp = add nuw nsw i64 %i.0, 8 + %tmp4 = getelementptr inbounds float, ptr %B, i64 %tmp + %tmp5 = load float, ptr %tmp4, align 4 + %tmp6 = fmul float %tmp5, 4.000000e+00 + store float %tmp6, ptr %tmp4, align 4 + br label %bb7 + +bb7: ; preds = %bb3 + %tmp8 = add nuw nsw i64 %i.0, 1 + br label %bb2 + +bb9: ; preds = %bb2 + br label %bb10 + +bb10: ; preds = %bb15, %bb9 + %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ] + %exitcond = icmp ne i64 %i1.0, 8 + br i1 %exitcond, label %bb11, label %bb17 + +bb11: ; preds = %bb10 + %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0 + %tmp13 = load float, ptr %tmp12, align 4 + %tmp14 = fmul float %tmp13, 1.200000e+01 + store float %tmp14, ptr %tmp12, align 4 + br label %bb15 + +bb15: ; preds = %bb11 + %tmp16 = add nuw nsw i64 %i1.0, 1 + br label %bb10 + +bb17: ; preds = %bb10 + ret void +} diff --git a/polly/test/GPGPU/only-part-of-array-modified.ll b/polly/test/GPGPU/only-part-of-array-modified.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/only-part-of-array-modified.ll @@ -0,0 +1,40 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s +; +; REQUIRES: pollyacc +; +; void foo(float A[], float B[]) { +; for (long i = 0; i < 1024; i++) +; A[2 * i] = B[i]; +; } + +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i32), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2047) * sizeof(i32), cudaMemcpyHostToDevice)); + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(ptr %A, ptr %B) { +bb: + br label %bb1 + +bb1: ; preds = %bb8, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb10 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds float, ptr %B, i64 %i.0 + %tmp4 = load i32, ptr %tmp, align 4 + %tmp5 = shl nsw i64 %i.0, 1 + %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5 + store i32 %tmp4, ptr %tmp6, align 4 + br label %bb8 + +bb8: ; preds = %bb2 + %tmp9 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb10: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/parametric-loop-bound.ll b/polly/test/GPGPU/parametric-loop-bound.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/parametric-loop-bound.ll @@ -0,0 +1,62 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg \ +; RUN: -S < %s | \ +; RUN: FileCheck -check-prefix=IR %s + +; REQUIRES: pollyacc + +; void foo(long A[], long n) { +; for (long i = 0; i < n; i++) +; A[i] += 100; +; } + +; CODE: if (n >= 1) { +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (n) * sizeof(i64), cudaMemcpyHostToDevice)); +; CODE-NEXT: { 
+; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(n >= 1048545 ? 32768 : (n + 31) / 32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, n); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (n) * sizeof(i64), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: for (int c0 = 0; c0 <= (n - 32 * b0 - 1) / 1048576; c0 += 1) +; CODE-NEXT: if (n >= 32 * b0 + t0 + 1048576 * c0 + 1) +; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0); + +; IR: store i64 %n, ptr %polly_launch_0_param_1 +; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1 +; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGA]] + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(ptr %A, i64 %n) { +bb: + br label %bb1 + +bb1: ; preds = %bb6, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] + %tmp = icmp slt i64 %i.0, %n + br i1 %tmp, label %bb2, label %bb8 + +bb2: ; preds = %bb1 + %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0 + %tmp4 = load i64, ptr %tmp3, align 8 + %tmp5 = add nsw i64 %tmp4, 100 + store i64 %tmp5, ptr %tmp3, align 8 + br label %bb6 + +bb6: ; preds = %bb2 + %tmp7 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb8: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/partial_writes.ll b/polly/test/GPGPU/partial_writes.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/partial_writes.ll @@ -0,0 +1,49 @@ +; RUN: opt %loadPolly -polly-import-jscop -polly-codegen-ppcg -polly-stmt-granularity=bb -S < %s \ +; RUN: | FileCheck %s + +; REQUIRES: pollyacc + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: polly_launchKernel + +; Function Attrs: nounwind uwtable +define void @partial_writes() { +bb: + %tmp = tail call ptr @wibble() #2 + br label %bb2 + +bb2: ; preds = %bb11, %bb + %tmp3 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] + %tmp4 = getelementptr inbounds [1200 x double], ptr %tmp, i64 0, i64 %tmp3 + %tmp5 = load double, ptr %tmp4, align 8, !tbaa !1 + br label %bb6 + +bb6: ; preds = %bb6, %bb2 + %tmp7 = phi double [ undef, %bb2 ], [ undef, %bb6 ] + %tmp8 = phi i64 [ 0, %bb2 ], [ %tmp9, %bb6 ] + store double undef, ptr %tmp4, align 8, !tbaa !1 + %tmp9 = add nuw nsw i64 %tmp8, 1 + %tmp10 = icmp eq i64 %tmp9, 900 + br i1 %tmp10, label %bb11, label %bb6 + +bb11: ; preds = %bb6 + %tmp12 = add nuw nsw i64 %tmp3, 1 + %tmp13 = icmp eq i64 %tmp12, 1200 + br i1 %tmp13, label %bb14, label %bb2 + +bb14: ; preds = %bb11 + ret void +} + +declare ptr @wibble() + + +!llvm.ident = !{!0} + +!0 = !{!"clang version 6.0.0 (trunk 309912) (llvm/trunk 309933)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"double", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop b/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop @@ -0,0 +1,47 @@ +{ + "arrays" : [ + { + "name" : "MemRef_tmp", + "sizes" : [ "*" ], + "type" : "double" + } + ], + "context" : "{ : }", + "name" : "%bb2---%bb14", + "statements" : [ + { + "accesses" : [ + { + "kind" : "read", + "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }" + } + ], + "domain" : "{ Stmt_bb2[i0] : 0 <= i0 <= 1199 }", + "name" : "Stmt_bb2", + "schedule" 
: "{ Stmt_bb2[i0] -> [i0, 0, 0] }" + }, + { + "accesses" : [ + { + "kind" : "write", + "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] : i1 <= 898 }" + }, + { + "kind" : "read", + "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }" + } + ], + "domain" : "{ Stmt_bb6[i0, i1] : 0 <= i0 <= 1199 and 0 <= i1 <= 899 }", + "name" : "Stmt_bb6", + "schedule" : "{ Stmt_bb6[i0, i1] -> [i0, 1, i1] }" + } + ] +} diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/phi-nodes-in-kernel.ll @@ -0,0 +1,86 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR + +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s -check-prefix=KERNEL-IR + +; REQUIRES: pollyacc + +; Approximate C source: +; void kernel_dynprog(int c[50]) { +; int iter = 0; +; int outl = 0; +; +; while(1) { +; for(int indvar = 1 ; indvar <= 49; indvar++) { +; c[indvar] = undef; +; } +; add78 = c[49] + outl; +; inc80 = iter + 1; +; +; if (true) break; +; +; outl = add78; +; iter = inc80; +; } +;} +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CODE: cudaCheckReturn(cudaMalloc((void **) &dev_MemRef_c, (50) * sizeof(i32))); + +; CODE: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(2); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_c); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_c)); + +; CODE: # kernel0 +; CODE-NEXT: if (32 * b0 + t0 <= 48) +; CODE-NEXT: Stmt_for_body17(0, 32 * b0 + t0); + +; IR-LABEL: call void @polly_freeKernel +; IR: [[REGC:%.+]] = bitcast i32* %{{[0-9]+}} to i8* +; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196) + +; KERNEL-IR: define ptx_kernel void @FUNC_kernel_dynprog_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_c) #0 { +; KERNEL-IR: %polly.access.MemRef_c = getelementptr i32, i32 addrspace(1)* %polly.access.cast.MemRef_c, i64 %9 +; KERNEL-IR-NEXT: store i32 422, i32 addrspace(1)* %polly.access.MemRef_c, align 4 + +define void @kernel_dynprog([50 x i32]* %c) { +entry: + %arrayidx77 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 49 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond15.for.cond12.loopexit_crit_edge, %entry + %out_l.055 = phi i32 [ 0, %entry ], [ %add78, %for.cond15.for.cond12.loopexit_crit_edge ] + %iter.054 = phi i32 [ 0, %entry ], [ %inc80, %for.cond15.for.cond12.loopexit_crit_edge ] + br label %for.body17 + +for.cond15.for.cond12.loopexit_crit_edge: ; preds = %for.body17 + %tmp = load i32, i32* %arrayidx77, align 4 + %add78 = add nsw i32 %tmp, %out_l.055 + %inc80 = add nuw nsw i32 %iter.054, 1 + br i1 false, label %for.cond1.preheader, label %for.end81 + +for.body17: ; preds = %for.body17, %for.cond1.preheader + %indvars.iv71 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next72, %for.body17 ] + %arrayidx69 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 %indvars.iv71 + 
store i32 422, i32* %arrayidx69, align 4 + %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1 + %lftr.wideiv74 = trunc i64 %indvars.iv.next72 to i32 + %exitcond75 = icmp ne i32 %lftr.wideiv74, 50 + br i1 %exitcond75, label %for.body17, label %for.cond15.for.cond12.loopexit_crit_edge + +for.end81: ; preds = %for.cond15.for.cond12.loopexit_crit_edge + ret void +} diff --git a/polly/test/GPGPU/private-memory.ll b/polly/test/GPGPU/private-memory.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/private-memory.ll @@ -0,0 +1,82 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -polly-acc-use-private \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ +; RUN: -polly-acc-use-private \ +; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ +; RUN: FileCheck -check-prefix=KERNEL %s + +; REQUIRES: pollyacc + +; void add(float *A) { +; for (long i = 0; i < 32; i++) +; for (long j = 0; j < 10; j++) +; A[i] += 1; +; } + +; CODE: # kernel0 +; CODE: { +; CODE: read(t0); +; CODE: for (int c3 = 0; c3 <= 9; c3 += 1) +; CODE: Stmt_bb5(t0, c3); +; CODE: write(t0); +; CODE: } + +; KERNEL: %private_array = alloca [1 x float] + +; KERNEL: %polly.access.cast.private_array = bitcast [1 x float]* %private_array to float* +; KERNEL-NEXT: %polly.access.private_array = getelementptr float, float* %polly.access.cast.private_array, i64 0 +; KERNEL-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; KERNEL-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0 +; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_A +; KERNEL-NEXT: store float %shared.read, float* %polly.access.private_array + +; KERNEL: %polly.access.cast.private_array5 = bitcast [1 x float]* %private_array to float* +; KERNEL-NEXT: %polly.access.private_array6 = getelementptr float, float* %polly.access.cast.private_array5, i64 0 +; KERNEL-NEXT: %polly.access.cast.MemRef_A7 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; KERNEL-NEXT: %polly.access.MemRef_A8 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A7, i64 %t0 +; KERNEL-NEXT: %shared.write = load float, float* %polly.access.private_array6 +; KERNEL-NEXT: store float %shared.write, float addrspace(1)* %polly.access.MemRef_A8 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @add(float* %A) { +bb: + br label %bb2 + +bb2: ; preds = %bb11, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] + %exitcond1 = icmp ne i64 %i.0, 32 + br i1 %exitcond1, label %bb3, label %bb13 + +bb3: ; preds = %bb2 + br label %bb4 + +bb4: ; preds = %bb8, %bb3 + %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] + %exitcond = icmp ne i64 %j.0, 10 + br i1 %exitcond, label %bb5, label %bb10 + +bb5: ; preds = %bb4 + %tmp = getelementptr inbounds float, float* %A, i64 %i.0 + %tmp6 = load float, float* %tmp, align 4 + %tmp7 = fadd float %tmp6, 1.000000e+00 + store float %tmp7, float* %tmp, align 4 + br label %bb8 + +bb8: ; preds = %bb5 + %tmp9 = add nuw nsw i64 %j.0, 1 + br label %bb4 + +bb10: ; preds = %bb4 + br label %bb11 + +bb11: ; preds = %bb10 + %tmp12 = add nuw nsw i64 %i.0, 1 + br label %bb2 + +bb13: ; preds = %bb2 + ret void +} diff --git a/polly/test/GPGPU/privatization-simple.ll b/polly/test/GPGPU/privatization-simple.ll new file mode 100644 --- /dev/null +++ 
b/polly/test/GPGPU/privatization-simple.ll @@ -0,0 +1,58 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) + +; void f(int A[], int B[], int control, int C[]) { +; int x; +; #pragma scop +; for(int i = 0; i < 1000; i ++) { +; x = 0; +; if(control) x = C[i]; +; B[i] = x * A[i]; +; +; } +; #pragma endscop +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(ptr %A, ptr %B, i32 %control, ptr %C) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %if.end + %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] + %tobool = icmp eq i32 %control, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %tmp4 = load i32, ptr %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %for.body, %if.then + %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] + %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %tmp8 = load i32, ptr %arrayidx2, align 4 + %mul = mul nsw i32 %tmp8, %x.0 + %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + store i32 %mul, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.end: ; preds = %if.end + ret void +} diff --git a/polly/test/GPGPU/privatization.ll b/polly/test/GPGPU/privatization.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/privatization.ll @@ -0,0 +1,62 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: checkPrivatization +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 + + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. 
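Editorial note: the declare checked below, together with the polly_launch_0_params stores checked in earlier tests of this patch, fixes the host-side calling convention: every kernel argument is passed indirectly through an array of pointers. A minimal C sketch, assuming the five i32 parameters are grid and block dimensions; the grid/block values and variable names are invented for illustration.

/* Prototype inferred from the HOST-IR declare; an assumption of this sketch,
 * not documented Polly API. */
void polly_launchKernel(void *kernel, int grid_x, int grid_y,
                        int block_x, int block_y, int block_z,
                        void **params);

void launch_sketch(void *kernel, void *dev_A, long n) {
  void *params[2];
  params[0] = &dev_A;   /* address of the device pointer (cf. polly_launch_0_param_0) */
  params[1] = &n;       /* address of the scalar parameter (cf. polly_launch_0_param_1) */
  polly_launchKernel(kernel, /*grid*/ 4, 1, /*block*/ 32, 1, 1, params);
}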
+; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) + +; +; +; void checkPrivatization(int A[], int B[], int C[], int control) { +; int x; +; #pragma scop +; for (int i = 0; i < 1000; i++) { +; x = 0; +; if (control) +; x += C[i]; +; +; B[i] = x * A[i]; +; } +; #pragma endscop +; } +; +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @checkPrivatization(ptr %A, ptr %B, ptr %C, i32 %control) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %if.end + %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] + %tobool = icmp eq i32 %control, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %tmp4 = load i32, ptr %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %for.body, %if.then + %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] + %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %tmp9 = load i32, ptr %arrayidx2, align 4 + %mul = mul nsw i32 %tmp9, %x.0 + %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + store i32 %mul, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.end: ; preds = %if.end + ret void +} diff --git a/polly/test/GPGPU/region-stmt.ll b/polly/test/GPGPU/region-stmt.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/region-stmt.ll @@ -0,0 +1,81 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR + +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (128) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(4); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, dev_MemRef_B); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (128) * sizeof(float), cudaMemcpyDeviceToHost)); + +; CODE: # kernel0 +; CODE-NEXT: Stmt_for_body__TO__if_end(32 * b0 + t0); + +; IR: @polly_initContext + +; KERNEL-IR: kernel_0 + +; REQUIRES: pollyacc + +; void foo(float A[], float B[]) { +; for (long i = 0; i < 128; i++) +; if (A[i] == 42) +; B[i] += 2 * i; +; else +; B[i] += 4 * i; +; } +; +source_filename = "/tmp/test.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(ptr %A, ptr %B) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] + %exitcond = icmp ne i64 %i.0, 128 + br i1 %exitcond, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds float, ptr %A, i64 %i.0 + %tmp = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp oeq float %tmp, 4.200000e+01 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %mul = shl nsw i64 %i.0, 1 + %conv = sitofp i64 %mul to float + %arrayidx2 = getelementptr inbounds float, ptr %B, i64 %i.0 + %tmp1 = load float, ptr %arrayidx2, align 4 + %add = fadd float %tmp1, %conv + store float %add, ptr %arrayidx2, align 4 + br label %if.end + +if.else: ; 
preds = %for.body + %mul3 = shl nsw i64 %i.0, 2 + %conv4 = sitofp i64 %mul3 to float + %arrayidx5 = getelementptr inbounds float, ptr %B, i64 %i.0 + %tmp2 = load float, ptr %arrayidx5, align 4 + %add6 = fadd float %tmp2, %conv4 + store float %add6, ptr %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %for.inc + +for.inc: ; preds = %if.end + %inc = add nuw nsw i64 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll b/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s -check-prefix=KERNEL-IR + +; REQUIRES: pollyacc + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4 +; KERNEL-IR-NEXT: br label %polly.merge + +define void @kernel_dynprog(ptr %sum_c) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry + br label %for.body3 + +for.cond1.loopexit: ; preds = %for.end + %indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1 + %exitcond57 = icmp ne i64 %indvars.iv.next56, 49 + br i1 %exitcond57, label %for.body3, label %for.inc55 + +for.body3: ; preds = %for.cond1.loopexit, %for.cond1.preheader + %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ] + %indvars.iv48 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next49, %for.cond1.loopexit ] + %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1 + %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv48, i64 %indvars.iv55 + store i32 0, ptr %arrayidx10, align 4 + %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv48 + br label %for.end + +for.end: ; preds = %for.body3 + br label %for.cond1.loopexit + +for.inc55: ; preds = %for.cond1.loopexit + ret void +} diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll b/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll @@ -0,0 +1,62 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s -check-prefix=KERNEL-IR + +; REQUIRES: pollyacc + +; Ensure that no dead instructions are emitted between the store and the +; branch instruction of the ScopStmt. At some point, our dead-code-elimination +; did not remove code that was inserted to compute the old (unused) branch +; condition. This code referred to CPU registers and consequently resulted +; in invalid bitcode. 
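Editorial note: in C terms, the statement being code-generated in @kernel_dynprog below looks roughly like the sketch that follows (loop bounds approximated). Inside the kernel only the store survives; the comparison that fed the original branch is dead there, and because its operands live in host-side registers, copying it would yield invalid kernel IR, which is what the checks guard against.

/* Sketch only, not part of the patch; bounds are approximate. */
void stmt_sketch(int (*sum_c)[50][50]) {
  for (long i = 0; i < 49; i++)
    for (long j = 0; j < 50; j++) {
      sum_c[i][j][i] = 0;        /* kept: matches the KERNEL-IR store          */
      int dead = (i + 1 < j);    /* dropped: the old, unused branch condition  */
      (void)dead;
    }
}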
+ +; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4 +; KERNEL-IR-NEXT: br label %polly.merge + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @kernel_dynprog(ptr %sum_c) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry + br label %for.body3 + +for.cond4.for.cond1.loopexit_crit_edge: ; preds = %for.end + br label %for.cond1.loopexit + +for.cond1.loopexit: ; preds = %for.cond4.for.cond1.loopexit_crit_edge + br i1 undef, label %for.body3, label %for.inc55 + +for.body3: ; preds = %for.cond1.loopexit, %for.cond1.preheader + %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ] + %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1 + br label %for.body6 + +for.body6: ; preds = %for.end, %for.body3 + %indvars.iv50 = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next51, %for.end ] + %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 %indvars.iv55 + store i32 0, ptr %arrayidx10, align 4 + %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv50 + br i1 %cmp1334, label %for.body14.lr.ph, label %for.end + +for.body14.lr.ph: ; preds = %for.body6 + br label %for.body14 + +for.body14: ; preds = %for.body14, %for.body14.lr.ph + %arrayidx32 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 0 + br i1 false, label %for.body14, label %for.cond12.for.end_crit_edge + +for.cond12.for.end_crit_edge: ; preds = %for.body14 + br label %for.end + +for.end: ; preds = %for.cond12.for.end_crit_edge, %for.body6 + %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1 + %lftr.wideiv53 = trunc i64 %indvars.iv.next51 to i32 + %exitcond54 = icmp ne i32 %lftr.wideiv53, 50 + br i1 %exitcond54, label %for.body6, label %for.cond4.for.cond1.loopexit_crit_edge + +for.inc55: ; preds = %for.cond1.loopexit + unreachable +} diff --git a/polly/test/GPGPU/run-time-check.ll b/polly/test/GPGPU/run-time-check.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/run-time-check.ll @@ -0,0 +1,58 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR +; +; REQUIRES: pollyacc +; +; void foo(long n, float A[][32]) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < n; j++) +; A[i][j] += A[i + 1][j + 1]; +; } + +; IR: %tmp = icmp slt i64 %i.0, %n +; IR-NEXT: br i1 %tmp, label %bb2, label %polly.merge_new_and_old + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(i64 %n, ptr %A) { +bb: + br label %bb1 + +bb1: ; preds = %bb15, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp16, %bb15 ] + %tmp = icmp slt i64 %i.0, %n + br i1 %tmp, label %bb2, label %bb17 + +bb2: ; preds = %bb1 + br label %bb3 + +bb3: ; preds = %bb12, %bb2 + %j.0 = phi i64 [ 0, %bb2 ], [ %tmp13, %bb12 ] + %exitcond = icmp ne i64 %j.0, %n + br i1 %exitcond, label %bb4, label %bb14 + +bb4: ; preds = %bb3 + %tmp5 = add nuw nsw i64 %j.0, 1 + %tmp6 = add nuw nsw i64 %i.0, 1 + %tmp7 = getelementptr inbounds [32 x float], ptr %A, i64 %tmp6, i64 %tmp5 + %tmp8 = load float, ptr %tmp7, align 4 + %tmp9 = getelementptr inbounds [32 x float], ptr %A, i64 %i.0, i64 %j.0 + %tmp10 = load float, ptr %tmp9, align 4 + %tmp11 = fadd float %tmp10, %tmp8 + store float %tmp11, ptr %tmp9, align 4 + br label %bb12 + +bb12: ; preds = %bb4 + %tmp13 = add nuw nsw i64 %j.0, 1 + br label %bb3 + +bb14: ; preds = %bb3 + br label %bb15 + +bb15: ; preds = %bb14 
+  %tmp16 = add nuw nsw i64 %i.0, 1
+  br label %bb1
+
+bb17:                                             ; preds = %bb1
+  ret void
+}
diff --git a/polly/test/GPGPU/scalar-param-and-value-32-bit.ll b/polly/test/GPGPU/scalar-param-and-value-32-bit.ll
new file mode 100644
--- /dev/null
+++ b/polly/test/GPGPU/scalar-param-and-value-32-bit.ll
@@ -0,0 +1,41 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck %s
+
+; REQUIRES: pollyacc, target=nvptx{{.*}}
+;
+; void foo(float A[], int n) {
+;   for (long j = 0; j < n; j++)
+;     A[j + n] += 42;
+; }
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i32 %n)
+
+define void @foo(ptr %A, i32 %n) {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb9, %bb
+  %j.0 = phi i64 [ 0, %bb ], [ %tmp10, %bb9 ]
+  %tmp = sext i32 %n to i64
+  %tmp2 = icmp slt i64 %j.0, %tmp
+  br i1 %tmp2, label %bb3, label %bb11
+
+bb3:                                              ; preds = %bb1
+  %tmp4 = sext i32 %n to i64
+  %tmp5 = add nsw i64 %j.0, %tmp4
+  %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5
+  %tmp7 = load float, ptr %tmp6, align 4
+  %tmp8 = fadd float %tmp7, 4.200000e+01
+  store float %tmp8, ptr %tmp6, align 4
+  br label %bb9
+
+bb9:                                              ; preds = %bb3
+  %tmp10 = add nuw nsw i64 %j.0, 1
+  br label %bb1
+
+bb11:                                             ; preds = %bb1
+  ret void
+}
diff --git a/polly/test/GPGPU/scalar-param-and-value-use.ll b/polly/test/GPGPU/scalar-param-and-value-use.ll
new file mode 100644
--- /dev/null
+++ b/polly/test/GPGPU/scalar-param-and-value-use.ll
@@ -0,0 +1,67 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=IR %s
+
+; REQUIRES: pollyacc, target=nvptx{{.*}}
+
+; void foo(long n, float A[][n]) {
+;   for (long i = 0; i < 32; i++)
+;     for (long j = 0; j < 32; j++)
+;       A[i][j] += A[i + 1][j + 1];
+; }
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; This test case failed at some point as %n was only available in this kernel
+; when referenced through an isl_id in an isl ast expression, but not when
+; it was referenced from a SCEV or instruction that is not part of any loop
+; bound.
+ +; IR: %polly.access.mul.MemRef_A = mul nsw i64 {{.*}}, %n + +define void @foo(i64 %n, ptr %A) { +bb: + br label %bb2 + +bb2: ; preds = %bb19, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp20, %bb19 ] + %exitcond1 = icmp ne i64 %i.0, 32 + br i1 %exitcond1, label %bb3, label %bb21 + +bb3: ; preds = %bb2 + br label %bb4 + +bb4: ; preds = %bb16, %bb3 + %j.0 = phi i64 [ 0, %bb3 ], [ %tmp17, %bb16 ] + %exitcond = icmp ne i64 %j.0, 32 + br i1 %exitcond, label %bb5, label %bb18 + +bb5: ; preds = %bb4 + %tmp = add nuw nsw i64 %j.0, 1 + %tmp6 = add nuw nsw i64 %i.0, 1 + %tmp7 = mul nsw i64 %tmp6, %n + %tmp8 = getelementptr inbounds float, ptr %A, i64 %tmp7 + %tmp9 = getelementptr inbounds float, ptr %tmp8, i64 %tmp + %tmp10 = load float, ptr %tmp9, align 4 + %tmp11 = mul nsw i64 %i.0, %n + %tmp12 = getelementptr inbounds float, ptr %A, i64 %tmp11 + %tmp13 = getelementptr inbounds float, ptr %tmp12, i64 %j.0 + %tmp14 = load float, ptr %tmp13, align 4 + %tmp15 = fadd float %tmp14, %tmp10 + store float %tmp15, ptr %tmp13, align 4 + br label %bb16 + +bb16: ; preds = %bb5 + %tmp17 = add nuw nsw i64 %j.0, 1 + br label %bb4 + +bb18: ; preds = %bb4 + br label %bb19 + +bb19: ; preds = %bb18 + %tmp20 = add nuw nsw i64 %i.0, 1 + br label %bb2 + +bb21: ; preds = %bb2 + ret void +} diff --git a/polly/test/GPGPU/scalar-parameter-fp128.ll b/polly/test/GPGPU/scalar-parameter-fp128.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-fp128.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s + +; XFAIL: * + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; This fails today with "LowerFormalArguments didn't emit the correct number of values!" + +; void foo(fp128 A[], fp128 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @fp128(ptr %A, fp128 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0 + %tmp3 = load fp128, ptr %tmp, align 4 + %tmp4 = fadd fp128 %tmp3, %b + store fp128 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + diff --git a/polly/test/GPGPU/scalar-parameter-half.ll b/polly/test/GPGPU/scalar-parameter-half.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-half.ll @@ -0,0 +1,35 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; void foo(half A[], half b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @half(ptr %A, half %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds half, ptr %A, i64 %i.0 + %tmp3 = load half, ptr %tmp, align 4 + %tmp4 = fadd half %tmp3, %b + store half %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + diff --git a/polly/test/GPGPU/scalar-parameter-i120.ll b/polly/test/GPGPU/scalar-parameter-i120.ll new file mode 100644 --- /dev/null +++ 
b/polly/test/GPGPU/scalar-parameter-i120.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s + +; XFAIL: * + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" + +; void foo(i120 A[], i120 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @i120(ptr %A, i120 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i120 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i120 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i120, ptr %A, i120 %i.0 + %tmp3 = load i120, ptr %tmp, align 4 + %tmp4 = add i120 %tmp3, %b + store i120 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i120 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + diff --git a/polly/test/GPGPU/scalar-parameter-i128.ll b/polly/test/GPGPU/scalar-parameter-i128.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-i128.ll @@ -0,0 +1,34 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; void foo(i128 A[], i128 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @i128(ptr %A, i128 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i128 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i128 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i128, ptr %A, i128 %i.0 + %tmp3 = load i128, ptr %tmp, align 4 + %tmp4 = add i128 %tmp3, %b + store i128 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i128 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/scalar-parameter-i3000.ll b/polly/test/GPGPU/scalar-parameter-i3000.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-i3000.ll @@ -0,0 +1,38 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s + +; XFAIL: * + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" + +; void foo(i3000 A[], i3000 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @i3000(ptr %A, i3000 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i3000 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i3000 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i3000, ptr %A, i3000 %i.0 + %tmp3 = load i3000, ptr %tmp, align 4 + %tmp4 = add i3000 %tmp3, %b + store i3000 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i3000 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/scalar-parameter-i80.ll b/polly/test/GPGPU/scalar-parameter-i80.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-i80.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s + +; XFAIL: * + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" + +; void 
foo(i80 A[], i80 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @i80(ptr %A, i80 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i80 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i80 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i80, ptr %A, i80 %i.0 + %tmp3 = load i80, ptr %tmp, align 4 + %tmp4 = add i80 %tmp3, %b + store i80 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i80 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + diff --git a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll @@ -0,0 +1,38 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s + +; XFAIL: * + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; This fails today with "LowerFormalArguments didn't emit the correct number of values!" + +; void foo(fp128 A[], fp128 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @ppc_fp128(ptr %A, ppc_fp128 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds ppc_fp128, ptr %A, i64 %i.0 + %tmp3 = load ppc_fp128, ptr %tmp, align 4 + %tmp4 = fadd ppc_fp128 %tmp3, %b + store ppc_fp128 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s + +; XFAIL: * + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; This fails today with "LowerFormalArguments didn't emit the correct number of values!" 
+ +; void foo(fp128 A[], fp128 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @fp128(ptr %A, fp128 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0 + %tmp3 = load fp128, ptr %tmp, align 4 + %tmp4 = fadd fp128 %tmp3, %b + store fp128 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + diff --git a/polly/test/GPGPU/scalar-parameter.ll b/polly/test/GPGPU/scalar-parameter.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter.ll @@ -0,0 +1,411 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg \ +; RUN: -S < %s | \ +; RUN: FileCheck -check-prefix=IR %s + +; RUN: opt %loadPolly -polly-codegen-ppcg \ +; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ +; RUN: FileCheck -check-prefix=KERNEL %s + +; XFAIL: * + +; REQUIRES: pollyacc, target=nvptx{{.*}} + +; This fails today due to extensive output differences from when the test was written. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, float %MemRef_b) + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, MemRef_b); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb2(32 * b0 + t0); + +; void foo(float A[], float b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +define void @float(ptr %A, float %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds float, ptr %A, i64 %i.0 + %tmp3 = load float, ptr %tmp, align 4 + %tmp4 = fadd float %tmp3, %b + store float %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + +; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, double %MemRef_b) +; KERNEL-NEXT: entry: +; KERNEL-NEXT: %b.s2a = alloca double +; KERNEL-NEXT: store double %MemRef_b, ptr %b.s2a + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(double), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, MemRef_b); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(double), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb2(32 * b0 + t0); + +; void foo(double A[], double b) { +; for (long i = 0; i < 
1024; i++) +; A[i] += b; +; } +; +define void @double(ptr %A, double %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds double, ptr %A, i64 %i.0 + %tmp3 = load double, ptr %tmp, align 4 + %tmp4 = fadd double %tmp3, %b + store double %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i1), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i1), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb2(32 * b0 + t0); + +; void foo(i1 A[], i1 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +define void @i1(ptr %A, i1 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i1, ptr %A, i64 %i.0 + %tmp3 = load i1, ptr %tmp, align 4 + %tmp4 = add i1 %tmp3, %b + store i1 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i3), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i3), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb2(32 * b0 + t0); + +; void foo(i3 A[], i3 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +define void @i3(ptr %A, i3 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i3, ptr %A, i64 %i.0 + %tmp3 = load i3, ptr %tmp, align 4 + %tmp4 = add i3 %tmp3, %b + store i3 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i8), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i8), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb2(32 * b0 + t0); + +; void foo(i8 A[], i32 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +define void @i8(ptr %A, i8 %b) { +bb: + br label %bb1 + +bb1: ; 
preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i8, ptr %A, i64 %i.0 + %tmp3 = load i8, ptr %tmp, align 4 + %tmp4 = add i8 %tmp3, %b + store i8 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + +; IR-LABEL: @i8 + +; IR: [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A) +; IR-NEXT: store ptr [[REGA:%.+]], ptr %polly_launch_0_param_0 +; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params +; IR-NEXT: store i8 %b, ptr %polly_launch_0_param_1 +; IR-NEXT: [[REGD:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1 +; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGD]] + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i32), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb2(32 * b0 + t0); + +; void foo(i32 A[], i32 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +define void @i32(ptr %A, i32 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i32, ptr %A, i64 %i.0 + %tmp3 = load i32, ptr %tmp, align 4 + %tmp4 = add i32 %tmp3, %b + store i32 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i60), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i60), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb2(32 * b0 + t0); + +; void foo(i60 A[], i60 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +define void @i60(ptr %A, i60 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i60, ptr %A, i64 %i.0 + %tmp3 = load i60, ptr %tmp, align 4 + %tmp4 = add i60 %tmp3, %b + store i60 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: 
} + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb2(32 * b0 + t0); + +; void foo(i64 A[], i64 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +define void @i64(ptr %A, i64 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i64, ptr %A, i64 %i.0 + %tmp3 = load i64, ptr %tmp, align 4 + %tmp4 = add i64 %tmp3, %b + store i64 %tmp4, ptr %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll b/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll @@ -0,0 +1,65 @@ +; RUN: opt %loadPolly -polly-acc-dump-code -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ +; RUN: -polly-acc-dump-code -polly-stmt-granularity=bb \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODE + +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ +; RUN: -polly-invariant-load-hoisting -polly-stmt-granularity=bb < %s \ +; RUN: | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: { Stmt_loop_a[i0] -> MemRef_p[0] }; +; SCOP-NEXT: Execution Context: { : } +; SCOP-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: { +; CODE-NEXT: if (32 * b0 + t0 <= 1025) { +; CODE-NEXT: Stmt_loop(32 * b0 + t0); +; CODE-NEXT: write(0); +; CODE-NEXT: } +; CODE-NEXT: sync0(); +; CODE-NEXT: } + +; Check that we generate a correct "always false" branch. +; HOST-IR: br i1 false, label %polly.start, label %loop.pre_entry_bb + +; This test case checks that we generate correct code if PPCGCodeGeneration +; decides a build is unsuccessful with invariant load hoisting enabled. +; +; There is a conditional branch which switches between the original code and +; the new code. We try to set this conditional branch to branch on false. +; However, invariant load hoisting changes the structure of the scop, so we +; need to change the way we *locate* this instruction. 
+ +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.12.0" + +define void @foo(ptr %A, ptr %p) { +entry: + br label %loop + +loop: + %indvar = phi i64 [0, %entry], [%indvar.next, %loop] + %indvar.next = add i64 %indvar, 1 + %invariant = load float, ptr %p + %ptr = getelementptr float, ptr %A, i64 %indvar + store float 42.0, ptr %ptr + %cmp = icmp sle i64 %indvar, 1024 + br i1 %cmp, label %loop, label %loop2 + +loop2: + %indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2] + %indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2] + %indvar2.next = add i64 %indvar2, 1 + store float %indvar2f, ptr %A + %cmp2 = icmp sle i64 %indvar2, 1024 + br i1 %cmp2, label %loop2, label %end + +end: + ret void +} diff --git a/polly/test/GPGPU/scheduler-timeout.ll b/polly/test/GPGPU/scheduler-timeout.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/scheduler-timeout.ll @@ -0,0 +1,174 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; REQUIRES: pollyacc + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; This test case took at some point forever to schedule, as the isl scheduler +; seems to have problems if domain constraints appear in the dependences +; provided to the scheduler. + +; /* D := alpha*A*B*C + beta*D */ +; for (i = 0; i < _PB_NI; i++) +; for (j = 0; j < _PB_NJ; j++) +; { +; tmp[i][j] = 0; +; for (k = 0; k < _PB_NK; ++k) +; tmp[i][j] += alpha * A[i][k] * B[k][j]; +; } +; for (i = 0; i < _PB_NI; i++) +; for (j = 0; j < _PB_NL; j++) +; { +; D[i][j] *= beta; +; for (k = 0; k < _PB_NJ; ++k) +; D[i][j] += tmp[i][k] * C[k][j]; +; } + +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(16, 32); +; CODE-NEXT: dim3 k0_dimGrid(128, 128); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_tmp, dev_MemRef_A, MemRef_alpha, dev_MemRef_B); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: { +; CODE-NEXT: dim3 k1_dimBlock(16, 32); +; CODE-NEXT: dim3 k1_dimGrid(128, 128); +; CODE-NEXT: kernel1 <<>> (dev_MemRef_tmp, dev_MemRef_D, MemRef_beta, dev_MemRef_C); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_tmp, dev_MemRef_tmp, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_D, dev_MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost)); + +; CODE: # kernel0 +; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1) +; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) { +; CODE-NEXT: if (c2 == 0) +; CODE-NEXT: Stmt_for_body6(32 * b0 + t0, 32 * b1 + t1 + 16 * c4); +; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1) +; CODE-NEXT: Stmt_for_body11(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5); +; CODE-NEXT: } + +; CODE: # kernel1 +; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1) +; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) { +; CODE-NEXT: if (c2 == 0) +; CODE-NEXT: Stmt_for_body36(32 * b0 + t0, 32 * b1 + t1 
+ 16 * c4); +; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1) +; CODE-NEXT: Stmt_for_body44(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5); +; CODE-NEXT: } + + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start(i64, ptr nocapture) #0 + +; Function Attrs: nounwind uwtable +define internal void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, float %alpha, float %beta, ptr %tmp, ptr %A, ptr %B, ptr %C, ptr %D) #1 { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %entry.split, %for.inc28 + %indvars.iv19 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next20, %for.inc28 ] + br label %for.body6 + +for.cond31.preheader: ; preds = %for.inc28 + br label %for.cond34.preheader + +for.body6: ; preds = %for.cond4.preheader, %for.inc25 + %indvars.iv16 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next17, %for.inc25 ] + %arrayidx8 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16 + store float 0.000000e+00, ptr %arrayidx8, align 4, !tbaa !1 + br label %for.body11 + +for.body11: ; preds = %for.body6, %for.body11 + %indvars.iv13 = phi i64 [ 0, %for.body6 ], [ %indvars.iv.next14, %for.body11 ] + %arrayidx15 = getelementptr inbounds [4096 x float], ptr %A, i64 %indvars.iv19, i64 %indvars.iv13 + %tmp22 = load float, ptr %arrayidx15, align 4, !tbaa !1 + %mul = fmul float %tmp22, %alpha + %arrayidx19 = getelementptr inbounds [4096 x float], ptr %B, i64 %indvars.iv13, i64 %indvars.iv16 + %tmp23 = load float, ptr %arrayidx19, align 4, !tbaa !1 + %mul20 = fmul float %mul, %tmp23 + %arrayidx24 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16 + %tmp24 = load float, ptr %arrayidx24, align 4, !tbaa !1 + %add = fadd float %tmp24, %mul20 + store float %add, ptr %arrayidx24, align 4, !tbaa !1 + %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 + %exitcond15 = icmp ne i64 %indvars.iv.next14, 4096 + br i1 %exitcond15, label %for.body11, label %for.inc25 + +for.inc25: ; preds = %for.body11 + %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 + %exitcond18 = icmp ne i64 %indvars.iv.next17, 4096 + br i1 %exitcond18, label %for.body6, label %for.inc28 + +for.inc28: ; preds = %for.inc25 + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond21 = icmp ne i64 %indvars.iv.next20, 4096 + br i1 %exitcond21, label %for.cond4.preheader, label %for.cond31.preheader + +for.cond34.preheader: ; preds = %for.cond31.preheader, %for.inc65 + %indvars.iv10 = phi i64 [ 0, %for.cond31.preheader ], [ %indvars.iv.next11, %for.inc65 ] + br label %for.body36 + +for.body36: ; preds = %for.cond34.preheader, %for.inc62 + %indvars.iv7 = phi i64 [ 0, %for.cond34.preheader ], [ %indvars.iv.next8, %for.inc62 ] + %arrayidx40 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7 + %tmp25 = load float, ptr %arrayidx40, align 4, !tbaa !1 + %mul41 = fmul float %tmp25, %beta + store float %mul41, ptr %arrayidx40, align 4, !tbaa !1 + br label %for.body44 + +for.body44: ; preds = %for.body36, %for.body44 + %indvars.iv = phi i64 [ 0, %for.body36 ], [ %indvars.iv.next, %for.body44 ] + %arrayidx48 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv10, i64 %indvars.iv + %tmp26 = load float, ptr %arrayidx48, align 4, !tbaa !1 + %arrayidx52 = getelementptr inbounds [4096 x float], ptr %C, i64 %indvars.iv, i64 %indvars.iv7 + %tmp27 = load float, ptr %arrayidx52, align 4, !tbaa !1 + %mul53 = fmul float %tmp26, %tmp27 
+  %arrayidx57 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7
+  %tmp28 = load float, ptr %arrayidx57, align 4, !tbaa !1
+  %add58 = fadd float %tmp28, %mul53
+  store float %add58, ptr %arrayidx57, align 4, !tbaa !1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 4096
+  br i1 %exitcond, label %for.body44, label %for.inc62
+
+for.inc62:                                        ; preds = %for.body44
+  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
+  %exitcond9 = icmp ne i64 %indvars.iv.next8, 4096
+  br i1 %exitcond9, label %for.body36, label %for.inc65
+
+for.inc65:                                        ; preds = %for.inc62
+  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
+  %exitcond12 = icmp ne i64 %indvars.iv.next11, 4096
+  br i1 %exitcond12, label %for.cond34.preheader, label %for.end67
+
+for.end67:                                        ; preds = %for.inc65
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, ptr nocapture) #0
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/shared-memory-scalar.ll b/polly/test/GPGPU/shared-memory-scalar.ll
new file mode 100644
--- /dev/null
+++ b/polly/test/GPGPU/shared-memory-scalar.ll
@@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -polly-acc-use-shared \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; REQUIRES: pollyacc
+
+; void add(float *A, float alpha) {
+;   for (long i = 0; i < 32; i++)
+;     for (long j = 0; j < 10; j++)
+;       A[i] += alpha;
+; }
+
+; CODE: read(t0);
+; CODE-NEXT: sync0();
+; CODE-NEXT: for (int c3 = 0; c3 <= 9; c3 += 1)
+; CODE-NEXT: Stmt_bb5(t0, c3);
+; CODE-NEXT: sync1();
+; CODE-NEXT: write(t0);
+
+; This test case was intended to test code generation for scalars stored
+; in shared memory. However, after properly marking the scalar as read-only
+; the scalar is no longer stored in shared memory. We still keep this
+; test case as documentation in case we ever forget to mark scalars as read-only.
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @add(ptr %A, float %alpha) { +bb: + br label %bb2 + +bb2: ; preds = %bb11, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] + %exitcond1 = icmp ne i64 %i.0, 32 + br i1 %exitcond1, label %bb3, label %bb13 + +bb3: ; preds = %bb2 + br label %bb4 + +bb4: ; preds = %bb8, %bb3 + %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] + %exitcond = icmp ne i64 %j.0, 10 + br i1 %exitcond, label %bb5, label %bb10 + +bb5: ; preds = %bb4 + %tmp = getelementptr inbounds float, ptr %A, i64 %i.0 + %tmp6 = load float, ptr %tmp, align 4 + %tmp7 = fadd float %tmp6, %alpha + store float %tmp7, ptr %tmp, align 4 + br label %bb8 + +bb8: ; preds = %bb5 + %tmp9 = add nuw nsw i64 %j.0, 1 + br label %bb4 + +bb10: ; preds = %bb4 + br label %bb11 + +bb11: ; preds = %bb10 + %tmp12 = add nuw nsw i64 %i.0, 1 + br label %bb2 + +bb13: ; preds = %bb2 + ret void +} diff --git a/polly/test/GPGPU/shared-memory-two-dimensional.ll b/polly/test/GPGPU/shared-memory-two-dimensional.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/shared-memory-two-dimensional.ll @@ -0,0 +1,103 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -polly-acc-use-shared \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ +; RUN: -polly-acc-use-shared \ +; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ +; RUN: FileCheck -check-prefix=KERNEL %s + +; REQUIRES: pollyacc + +; void foo(float A[], float b[][8]) { +; for (long i = 0; i < 32; i++) +; for (long j = 0; j < 16; j++) +; for (long k = 0; k < 8; k++) +; A[i] += j * k * b[j][k]; +; } + + +; CODE: # kernel0 +; CODE-NEXT: { +; CODE-NEXT: if (t0 <= 7) +; CODE-NEXT: for (int c0 = 0; c0 <= 15; c0 += 1) +; CODE-NEXT: read(c0, t0); +; CODE-NEXT: read(t0); +; CODE-NEXT: sync0(); +; CODE-NEXT: for (int c3 = 0; c3 <= 15; c3 += 1) +; CODE-NEXT: for (int c4 = 0; c4 <= 7; c4 += 1) +; CODE-NEXT: Stmt_bb8(t0, c3, c4); +; CODE-NEXT: sync1(); +; CODE-NEXT: write(t0); +; CODE-NEXT: } + +; KERNEL: @shared_MemRef_b = internal addrspace(3) global [16 x [8 x float]] zeroinitializer, align 4 + +; KERNEL: %polly.access.mul.MemRef_b = mul nsw i64 %polly.indvar, 8 +; KERNEL-NEXT: %polly.access.add.MemRef_b = add nsw i64 %polly.access.mul.MemRef_b, %t0 +; KERNEL-NEXT: %polly.access.MemRef_b = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_b, i64 %polly.access.add.MemRef_b +; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_b +; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_b + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(float* %A, [8 x float]* %b) { +bb: + br label %bb3 + +bb3: ; preds = %bb22, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp23, %bb22 ] + %exitcond2 = icmp ne i64 %i.0, 32 + br i1 %exitcond2, label %bb4, label %bb24 + +bb4: ; preds = %bb3 + br label %bb5 + +bb5: ; preds = %bb19, %bb4 + %j.0 = phi i64 [ 0, %bb4 ], [ %tmp20, %bb19 ] + %exitcond1 = icmp ne i64 %j.0, 16 + br i1 %exitcond1, label %bb6, label %bb21 + +bb6: ; preds = %bb5 + br label %bb7 + +bb7: ; preds = %bb16, %bb6 + %k.0 = phi i64 [ 0, %bb6 ], [ %tmp17, %bb16 ] + %exitcond = icmp ne i64 %k.0, 8 + br i1 %exitcond, label %bb8, label %bb18 + +bb8: ; preds = %bb7 + %tmp = mul nuw nsw i64 %j.0, %k.0 + %tmp9 = sitofp i64 %tmp to float + %tmp10 = getelementptr inbounds [8 x float], [8 x float]* %b, i64 %j.0, i64 
%k.0 + %tmp11 = load float, float* %tmp10, align 4 + %tmp12 = fmul float %tmp9, %tmp11 + %tmp13 = getelementptr inbounds float, float* %A, i64 %i.0 + %tmp14 = load float, float* %tmp13, align 4 + %tmp15 = fadd float %tmp14, %tmp12 + store float %tmp15, float* %tmp13, align 4 + br label %bb16 + +bb16: ; preds = %bb8 + %tmp17 = add nuw nsw i64 %k.0, 1 + br label %bb7 + +bb18: ; preds = %bb7 + br label %bb19 + +bb19: ; preds = %bb18 + %tmp20 = add nuw nsw i64 %j.0, 1 + br label %bb5 + +bb21: ; preds = %bb5 + br label %bb22 + +bb22: ; preds = %bb21 + %tmp23 = add nuw nsw i64 %i.0, 1 + br label %bb3 + +bb24: ; preds = %bb3 + ret void +} diff --git a/polly/test/GPGPU/shared-memory.ll b/polly/test/GPGPU/shared-memory.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/shared-memory.ll @@ -0,0 +1,83 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -polly-acc-use-shared \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ +; RUN: -polly-acc-use-shared \ +; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ +; RUN: FileCheck -check-prefix=KERNEL %s + +; REQUIRES: pollyacc + +; void add(float *A) { +; for (long i = 0; i < 32; i++) +; for (long j = 0; j < 10; j++) +; A[i] += 1; +; } + +; CODE: # kernel0 +; CODE: { +; CODE: read(t0); +; CODE: sync0(); +; CODE: for (int c3 = 0; c3 <= 9; c3 += 1) +; CODE: Stmt_bb5(t0, c3); +; CODE: sync1(); +; CODE: write(t0); +; CODE: } + +; KERNEL: @shared_MemRef_A = internal addrspace(3) global [32 x float] zeroinitializer, align 4 + +; KERNEL: %polly.access.shared_MemRef_A = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0 +; KERNEL-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; KERNEL-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0 +; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_A +; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_A + +; KERNEL: %polly.access.shared_MemRef_A3 = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0 +; KERNEL-NEXT: %polly.access.cast.MemRef_A4 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; KERNEL-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A4, i64 %t0 +; KERNEL-NEXT: %shared.write = load float, float addrspace(3)* %polly.access.shared_MemRef_A3 +; KERNEL-NEXT: store float %shared.write, float addrspace(1)* %polly.access.MemRef_A5 + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @add(float* %A) { +bb: + br label %bb2 + +bb2: ; preds = %bb11, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] + %exitcond1 = icmp ne i64 %i.0, 32 + br i1 %exitcond1, label %bb3, label %bb13 + +bb3: ; preds = %bb2 + br label %bb4 + +bb4: ; preds = %bb8, %bb3 + %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] + %exitcond = icmp ne i64 %j.0, 10 + br i1 %exitcond, label %bb5, label %bb10 + +bb5: ; preds = %bb4 + %tmp = getelementptr inbounds float, float* %A, i64 %i.0 + %tmp6 = load float, float* %tmp, align 4 + %tmp7 = fadd float %tmp6, 1.000000e+00 + store float %tmp7, float* %tmp, align 4 + br label %bb8 + +bb8: ; preds = %bb5 + %tmp9 = add nuw 
nsw i64 %j.0, 1 + br label %bb4 + +bb10: ; preds = %bb4 + br label %bb11 + +bb11: ; preds = %bb10 + %tmp12 = add nuw nsw i64 %i.0, 1 + br label %bb2 + +bb13: ; preds = %bb2 + ret void +} diff --git a/polly/test/GPGPU/simple-managed-memory-rewrite.ll b/polly/test/GPGPU/simple-managed-memory-rewrite.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/simple-managed-memory-rewrite.ll @@ -0,0 +1,71 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ +; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ +; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP: i32 MemRef_A[*]; + +; Check that we generate a constructor call for @A.toptr +; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr {{.*}}, ptr @A.toptr }] + +; Check that we generate a constructor +; 4 bytes * 100 = 400 +; HOST-IR: define void {{.*}}constructor() { +; HOST-IR-NEXT: entry: +; HOST-IR-NEXT: %mem.raw = call ptr @polly_mallocManaged(i64 400) +; HOST-IR-NEXT: store ptr %mem.raw, ptr @A.toptr +; HOST-IR-NEXT: ret void +; HOST-IR-NEXT: } + +; HOST-IR-NOT: @A + +source_filename = "test.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +@A = internal global [100 x i32] zeroinitializer, align 16 + +define void @f() { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [100 x i32], ptr @A, i64 0, i64 %indvars.iv1 + store i32 42, ptr %arrayidx, align 4, !tbaa !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/size-cast.ll b/polly/test/GPGPU/size-cast.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/size-cast.ll @@ -0,0 +1,63 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR + +; REQUIRES: pollyacc + +; This test case ensures that we properly sign-extend the types we are using. + +; CODE: if (arg >= 1 && arg1 == 0) { +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_arg3, MemRef_arg3, (arg) * sizeof(double), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(arg >= 1048545 ? 
32768 : (arg + 31) / 32); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_arg3, dev_MemRef_arg2, arg, arg1); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_arg2, dev_MemRef_arg2, (arg) * sizeof(double), cudaMemcpyDeviceToHost)); +; CODE-NEXT cudaCheckReturn(cudaFree(dev_MemRef_arg3)); +; CODE-NEXT cudaCheckReturn(cudaFree(dev_MemRef_arg2)); + +; CODE: # kernel0 +; CODE-NEXT: for (int c0 = 0; c0 <= (arg - 32 * b0 - 1) / 1048576; c0 += 1) +; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1) +; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0); + +; IR-LABEL: call ptr @polly_initContextCUDA() +; IR: sext i32 %arg to i64 +; IR-NEXT: mul i64 +; IR-NEXT: @polly_allocateMemoryForDevice + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @hoge(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3) { +bb: + br label %bb4 + +bb4: ; preds = %bb13, %bb + br label %bb6 + +bb5: ; preds = %bb13 + ret void + +bb6: ; preds = %bb6, %bb4 + %tmp = phi i64 [ 0, %bb4 ], [ %tmp10, %bb6 ] + %tmp7 = getelementptr inbounds double, ptr %arg3, i64 %tmp + %tmp8 = load double, ptr %tmp7, align 8 + %tmp9 = getelementptr inbounds [1000 x double], ptr %arg2, i64 0, i64 %tmp + store double %tmp8, ptr %tmp9, align 8 + %tmp10 = add nuw nsw i64 %tmp, 1 + %tmp11 = zext i32 %arg to i64 + %tmp12 = icmp ne i64 %tmp10, %tmp11 + br i1 %tmp12, label %bb6, label %bb13 + +bb13: ; preds = %bb6 + %tmp14 = zext i32 %arg1 to i64 + %tmp15 = icmp ne i64 0, %tmp14 + br i1 %tmp15, label %bb4, label %bb5 +} diff --git a/polly/test/GPGPU/spir-codegen.ll b/polly/test/GPGPU/spir-codegen.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/spir-codegen.ll @@ -0,0 +1,118 @@ +; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ +; RUN: -polly-gpu-arch=spir32 \ +; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output -enable-new-pm=0 < %s | \ +; RUN: FileCheck %s + +; REQUIRES: pollyacc + +; CHECK: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +; CHECK-NEXT: target triple = "spir-unknown-unknown" + +; CHECK-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call i32 @__gen_ocl_get_group_id0() +; CHECK-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64 +; CHECK-NEXT: %1 = call i32 @__gen_ocl_get_group_id1() +; CHECK-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64 +; CHECK-NEXT: %2 = call i32 @__gen_ocl_get_local_id0() +; CHECK-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64 +; CHECK-NEXT: %3 = call i32 @__gen_ocl_get_local_id1() +; CHECK-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64 +; CHECK-NEXT: br label %polly.loop_preheader + +; CHECK-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5 +; CHECK-NEXT: ret void + +; CHECK-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader +; CHECK-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ] +; CHECK-NEXT: %4 = mul nsw i64 32, %__gen_ocl_get_group_id0 +; CHECK-NEXT: %5 = add nsw i64 %4, %__gen_ocl_get_local_id0 +; CHECK-NEXT: %6 = mul nsw i64 32, 
%__gen_ocl_get_group_id1 +; CHECK-NEXT: %7 = add nsw i64 %6, %__gen_ocl_get_local_id1 +; CHECK-NEXT: %8 = mul nsw i64 16, %polly.indvar +; CHECK-NEXT: %9 = add nsw i64 %7, %8 +; CHECK-NEXT: br label %polly.stmt.bb5 + +; CHECK-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header +; CHECK-NEXT: %10 = mul i64 %5, %9 +; CHECK-NEXT: %p_tmp6 = sitofp i64 %10 to float +; CHECK-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; CHECK-NEXT: %11 = mul nsw i64 32, %__gen_ocl_get_group_id0 +; CHECK-NEXT: %12 = add nsw i64 %11, %__gen_ocl_get_local_id0 +; CHECK-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024 +; CHECK-NEXT: %13 = mul nsw i64 32, %__gen_ocl_get_group_id1 +; CHECK-NEXT: %14 = add nsw i64 %13, %__gen_ocl_get_local_id1 +; CHECK-NEXT: %15 = mul nsw i64 16, %polly.indvar +; CHECK-NEXT: %16 = add nsw i64 %14, %15 +; CHECK-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16 +; CHECK-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A +; CHECK-NEXT: %tmp8_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4 +; CHECK-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6 +; CHECK-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* +; CHECK-NEXT: %17 = mul nsw i64 32, %__gen_ocl_get_group_id0 +; CHECK-NEXT: %18 = add nsw i64 %17, %__gen_ocl_get_local_id0 +; CHECK-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024 +; CHECK-NEXT: %19 = mul nsw i64 32, %__gen_ocl_get_group_id1 +; CHECK-NEXT: %20 = add nsw i64 %19, %__gen_ocl_get_local_id1 +; CHECK-NEXT: %21 = mul nsw i64 16, %polly.indvar +; CHECK-NEXT: %22 = add nsw i64 %20, %21 +; CHECK-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22 +; CHECK-NEXT: %polly.access.MemRef_A4 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3 +; CHECK-NEXT: store float %p_tmp9, float addrspace(1)* %polly.access.MemRef_A4, align 4 +; CHECK-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 +; CHECK-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 1 +; CHECK-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit + +; CHECK-LABEL: polly.loop_preheader: ; preds = %entry +; CHECK-NEXT: br label %polly.loop_header + +; CHECK: attributes #0 = { "polly.skip.fn" } + +; void double_parallel_loop(float A[][1024]) { +; for (long i = 0; i < 1024; i++) +; for (long j = 0; j < 1024; j++) +; A[i][j] += i * j; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @double_parallel_loop([1024 x float]* %A) { +bb: + br label %bb2 + +bb2: ; preds = %bb13, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] + %exitcond1 = icmp ne i64 %i.0, 1024 + br i1 %exitcond1, label %bb3, label %bb15 + +bb3: ; preds = %bb2 + br label %bb4 + +bb4: ; preds = %bb10, %bb3 + %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] + %exitcond = icmp ne i64 %j.0, 1024 + br i1 %exitcond, label %bb5, label %bb12 + +bb5: ; preds = %bb4 + %tmp = mul nuw nsw i64 %i.0, %j.0 + %tmp6 = sitofp i64 %tmp to float + %tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0 + %tmp8 = load float, float* %tmp7, align 4 + %tmp9 = fadd float %tmp8, %tmp6 + store float %tmp9, float* %tmp7, align 4 + br label %bb10 + +bb10: ; preds = %bb5 + %tmp11 = add nuw nsw i64 %j.0, 1 + br label %bb4 + +bb12: ; preds = %bb4 + br label %bb13 + 
+bb13: ; preds = %bb12 + %tmp14 = add nuw nsw i64 %i.0, 1 + br label %bb2 + +bb15: ; preds = %bb2 + ret void +} diff --git a/polly/test/GPGPU/spir-typesize.ll b/polly/test/GPGPU/spir-typesize.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/spir-typesize.ll @@ -0,0 +1,90 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg \ +; RUN: -polly-gpu-arch=spir64 \ +; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ +; RUN: FileCheck -check-prefix=I64 %s + +; RUN: opt %loadPolly -polly-codegen-ppcg \ +; RUN: -polly-gpu-arch=spir32 \ +; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ +; RUN: FileCheck -check-prefix=I32 %s + +; REQUIRES: pollyacc + +; This test case checks whether the openCl runtime functions (get_local_id/get_group_id) return the right types for 32 and 64bit devices. + +; I32: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +; I32-NEXT: target triple = "spir-unknown-unknown" + +; I32-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { +; I32-NEXT: entry: +; I32-NEXT: %0 = call i32 @__gen_ocl_get_group_id0() +; I32-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64 +; I32-NEXT: %1 = call i32 @__gen_ocl_get_group_id1() +; I32-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64 +; I32-NEXT: %2 = call i32 @__gen_ocl_get_local_id0() +; I32-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64 +; I32-NEXT: %3 = call i32 @__gen_ocl_get_local_id1() +; I32-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64 +; I32-NEXT: br label %polly.loop_preheader + +; I64: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +; I64-next: target triple = "spir64-unknown-unknown" + +; I64-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { +; I64-NEXT: entry: +; I64-NEXT: %0 = call i64 @__gen_ocl_get_group_id0() +; I64-NEXT: %1 = call i64 @__gen_ocl_get_group_id1() +; I64-NEXT: %2 = call i64 @__gen_ocl_get_local_id0() +; I64-NEXT: %3 = call i64 @__gen_ocl_get_local_id1() +; I64-NEXT: br label %polly.loop_preheader + + +; void double_parallel_loop(float A[][1024]) { +; for (long i = 0; i < 1024; i++) +; for (long j = 0; j < 1024; j++) +; A[i][j] += i * j; +; } +; + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @double_parallel_loop(ptr %A) { +bb: + br label %bb2 + +bb2: ; preds = %bb13, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] + %exitcond1 = icmp ne i64 %i.0, 1024 + br i1 %exitcond1, label %bb3, label %bb15 + +bb3: ; preds = %bb2 + br label %bb4 + +bb4: ; preds = %bb10, %bb3 + %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] + %exitcond = icmp ne i64 %j.0, 1024 + br i1 %exitcond, label %bb5, label %bb12 + +bb5: ; preds = %bb4 + %tmp = mul nuw nsw i64 %i.0, %j.0 + %tmp6 = sitofp i64 %tmp to float + %tmp7 = getelementptr inbounds 
[1024 x float], ptr %A, i64 %i.0, i64 %j.0 + %tmp8 = load float, ptr %tmp7, align 4 + %tmp9 = fadd float %tmp8, %tmp6 + store float %tmp9, ptr %tmp7, align 4 + br label %bb10 + +bb10: ; preds = %bb5 + %tmp11 = add nuw nsw i64 %j.0, 1 + br label %bb4 + +bb12: ; preds = %bb4 + br label %bb13 + +bb13: ; preds = %bb12 + %tmp14 = add nuw nsw i64 %i.0, 1 + br label %bb2 + +bb15: ; preds = %bb2 + ret void +} diff --git a/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll b/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll @@ -0,0 +1,82 @@ +; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s + +; Check that we do not create a kernel if there is an +; unknown function call in a candidate kernel. + +; Check that we model the kernel as a scop. +; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end13 + +; If a kernel were generated, then this code would have been part of the kernel +; and not the `.ll` file that is generated. +; CHECK: %conv = fpext float %0 to double +; CHECK-NEXT: %1 = tail call double @extern.fn(double %conv) +; CHECK-NEXT: %conv6 = fptrunc double %1 to float + +; REQUIRES: pollyacc + +; static const int N = 1000; +; void f(float A[N][N], int n, float B[N][N]) { +; for(int i = 0; i < n; i++) { +; for(int j = 0; j < n; j++) { +; B[i][j] = extern_fn(A[i][j], 3); +; } +; +; } +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + +define void @f(ptr %A, i32 %n, ptr %B) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %cmp3 = icmp sgt i32 %n, 0 + br i1 %cmp3, label %for.cond1.preheader.lr.ph, label %for.end13 + +for.cond1.preheader.lr.ph: ; preds = %entry.split + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc11 + %indvars.iv5 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next6, %for.inc11 ] + %cmp21 = icmp sgt i32 %n, 0 + br i1 %cmp21, label %for.body3.lr.ph, label %for.inc11 + +for.body3.lr.ph: ; preds = %for.cond1.preheader + br label %for.body3 + +for.body3: ; preds = %for.body3.lr.ph, %for.body3 + %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [1000 x float], ptr %A, i64 %indvars.iv5, i64 %indvars.iv + %0 = load float, ptr %arrayidx5, align 4 + %conv = fpext float %0 to double + %1 = tail call double @extern.fn(double %conv) + %conv6 = fptrunc double %1 to float + %arrayidx10 = getelementptr inbounds [1000 x float], ptr %B, i64 %indvars.iv5, i64 %indvars.iv + store float %conv6, ptr %arrayidx10, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %n to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body3, label %for.cond1.for.inc11_crit_edge + +for.cond1.for.inc11_crit_edge: ; preds = %for.body3 + br label %for.inc11 + +for.inc11: ; preds = %for.cond1.for.inc11_crit_edge, %for.cond1.preheader + %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1 + %wide.trip.count7 = zext i32 %n to i64 + %exitcond8 = icmp ne i64 %indvars.iv.next6, %wide.trip.count7 + br i1 %exitcond8, label %for.cond1.preheader, label %for.cond.for.end13_crit_edge + +for.cond.for.end13_crit_edge: ; preds = %for.inc11 + br label %for.end13 + +for.end13: ; preds = 
%for.cond.for.end13_crit_edge, %entry.split + ret void +} + +declare double @extern.fn(double) #0 +attributes #0 = { readnone } diff --git a/polly/test/GPGPU/untouched-arrays.ll b/polly/test/GPGPU/untouched-arrays.ll new file mode 100644 --- /dev/null +++ b/polly/test/GPGPU/untouched-arrays.ll @@ -0,0 +1,270 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; REQUIRES: pollyacc + +; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_global_1, MemRef_global_1, (142) * sizeof(i32), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(10); +; CODE-NEXT: dim3 k0_dimGrid(1); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_global_1); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_global_1, dev_MemRef_global_1, (142) * sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE: cudaCheckReturn(cudaFree(dev_MemRef_global_1)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb33(t0, 0); + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.hoge = type { [23 x i16], [22 x i16], [14 x i16], [13 x i16] } + +@global = external global [9 x %struct.hoge], align 16 +@global.1 = external global [9 x [152 x i32]], align 16 + +; Function Attrs: nounwind uwtable +define void @widget() #0 { +bb: + br label %bb1 + +bb1: ; preds = %bb1, %bb + br i1 undef, label %bb1, label %bb2 + +bb2: ; preds = %bb2, %bb1 + br i1 undef, label %bb2, label %bb3 + +bb3: ; preds = %bb3, %bb2 + br i1 undef, label %bb3, label %bb4 + +bb4: ; preds = %bb4, %bb3 + br i1 undef, label %bb4, label %bb5 + +bb5: ; preds = %bb5, %bb4 + br i1 undef, label %bb5, label %bb6 + +bb6: ; preds = %bb6, %bb5 + br i1 undef, label %bb6, label %bb7 + +bb7: ; preds = %bb7, %bb6 + br i1 undef, label %bb7, label %bb8 + +bb8: ; preds = %bb8, %bb7 + br i1 undef, label %bb8, label %bb9 + +bb9: ; preds = %bb8 + br label %bb10 + +bb10: ; preds = %bb12, %bb9 + br label %bb11 + +bb11: ; preds = %bb11, %bb10 + br i1 undef, label %bb11, label %bb12 + +bb12: ; preds = %bb11 + br i1 undef, label %bb10, label %bb13 + +bb13: ; preds = %bb18, %bb12 + br i1 undef, label %bb16, label %bb14 + +bb14: ; preds = %bb16, %bb13 + br i1 undef, label %bb15, label %bb18 + +bb15: ; preds = %bb14 + br label %bb17 + +bb16: ; preds = %bb16, %bb13 + br i1 undef, label %bb16, label %bb14 + +bb17: ; preds = %bb17, %bb15 + br i1 undef, label %bb17, label %bb18 + +bb18: ; preds = %bb17, %bb14 + br i1 undef, label %bb13, label %bb19 + +bb19: ; preds = %bb25, %bb18 + br label %bb20 + +bb20: ; preds = %bb24, %bb19 + br i1 undef, label %bb21, label %bb24 + +bb21: ; preds = %bb20 + br i1 undef, label %bb23, label %bb22 + +bb22: ; preds = %bb21 + br label %bb24 + +bb23: ; preds = %bb21 + br label %bb24 + +bb24: ; preds = %bb23, %bb22, %bb20 + br i1 undef, label %bb20, label %bb25 + +bb25: ; preds = %bb24 + br i1 undef, label %bb19, label %bb26 + +bb26: ; preds = %bb56, %bb25 + %tmp = phi ptr [ undef, %bb56 ], [ getelementptr inbounds ([9 x [152 x i32]], ptr @global.1, i64 0, i64 0, i64 32), %bb25 ] + br label %bb27 + +bb27: ; preds = %bb27, %bb26 + br i1 undef, label %bb27, label %bb28 + +bb28: ; preds = %bb27 + br label %bb30 + +bb30: ; preds = %bb38, %bb28 + %tmp31 = phi i32 [ 3, %bb28 ], [ %tmp40, %bb38 ] + %tmp32 = phi ptr [ %tmp, %bb28 ], [ %tmp39, %bb38 ] + br label %bb33 + +bb33: ; preds = %bb33, %bb30 + %tmp34 = phi i32 [ 0, %bb30 ], [ %tmp37, %bb33 ] + %tmp35 = phi ptr [ %tmp32, %bb30 
], [ undef, %bb33 ] + %tmp36 = getelementptr inbounds i32, ptr %tmp35, i64 1 + store i32 undef, ptr %tmp36, align 4, !tbaa !1 + %tmp37 = add nuw nsw i32 %tmp34, 1 + br i1 false, label %bb33, label %bb38 + +bb38: ; preds = %bb33 + %tmp39 = getelementptr i32, ptr %tmp32, i64 12 + %tmp40 = add nuw nsw i32 %tmp31, 1 + %tmp41 = icmp ne i32 %tmp40, 13 + br i1 %tmp41, label %bb30, label %bb42 + +bb42: ; preds = %bb38 + %tmp43 = getelementptr inbounds [9 x %struct.hoge], ptr @global, i64 0, i64 0, i32 3, i64 0 + br label %bb44 + +bb44: ; preds = %bb51, %bb42 + %tmp45 = phi i32 [ 0, %bb42 ], [ %tmp52, %bb51 ] + %tmp46 = phi ptr [ %tmp43, %bb42 ], [ undef, %bb51 ] + %tmp47 = load i16, ptr %tmp46, align 2, !tbaa !5 + br label %bb48 + +bb48: ; preds = %bb48, %bb44 + %tmp49 = phi i32 [ 0, %bb44 ], [ %tmp50, %bb48 ] + %tmp50 = add nuw nsw i32 %tmp49, 1 + br i1 false, label %bb48, label %bb51 + +bb51: ; preds = %bb48 + %tmp52 = add nuw nsw i32 %tmp45, 1 + %tmp53 = icmp ne i32 %tmp52, 13 + br i1 %tmp53, label %bb44, label %bb54 + +bb54: ; preds = %bb51 + br label %bb55 + +bb55: ; preds = %bb55, %bb54 + br i1 undef, label %bb55, label %bb56 + +bb56: ; preds = %bb55 + br i1 undef, label %bb26, label %bb57 + +bb57: ; preds = %bb60, %bb56 + br label %bb58 + +bb58: ; preds = %bb58, %bb57 + br i1 undef, label %bb58, label %bb59 + +bb59: ; preds = %bb59, %bb58 + br i1 undef, label %bb59, label %bb60 + +bb60: ; preds = %bb59 + br i1 undef, label %bb57, label %bb61 + +bb61: ; preds = %bb65, %bb60 + br label %bb62 + +bb62: ; preds = %bb64, %bb61 + br label %bb63 + +bb63: ; preds = %bb63, %bb62 + br i1 undef, label %bb63, label %bb64 + +bb64: ; preds = %bb63 + br i1 undef, label %bb62, label %bb65 + +bb65: ; preds = %bb64 + br i1 undef, label %bb61, label %bb66 + +bb66: ; preds = %bb70, %bb65 + br label %bb67 + +bb67: ; preds = %bb69, %bb66 + br label %bb68 + +bb68: ; preds = %bb68, %bb67 + br i1 undef, label %bb68, label %bb69 + +bb69: ; preds = %bb68 + br i1 undef, label %bb67, label %bb70 + +bb70: ; preds = %bb69 + br i1 undef, label %bb66, label %bb71 + +bb71: ; preds = %bb73, %bb70 + br label %bb72 + +bb72: ; preds = %bb72, %bb71 + br i1 undef, label %bb72, label %bb73 + +bb73: ; preds = %bb72 + br i1 undef, label %bb71, label %bb74 + +bb74: ; preds = %bb80, %bb73 + br label %bb75 + +bb75: ; preds = %bb79, %bb74 + br label %bb76 + +bb76: ; preds = %bb78, %bb75 + br label %bb77 + +bb77: ; preds = %bb77, %bb76 + br i1 undef, label %bb77, label %bb78 + +bb78: ; preds = %bb77 + br i1 undef, label %bb76, label %bb79 + +bb79: ; preds = %bb78 + br i1 undef, label %bb75, label %bb80 + +bb80: ; preds = %bb79 + br i1 undef, label %bb74, label %bb81 + +bb81: ; preds = %bb85, %bb80 + br label %bb82 + +bb82: ; preds = %bb84, %bb81 + br label %bb83 + +bb83: ; preds = %bb83, %bb82 + br i1 undef, label %bb83, label %bb84 + +bb84: ; preds = %bb83 + br i1 undef, label %bb82, label %bb85 + +bb85: ; preds = %bb84 + br i1 undef, label %bb81, label %bb86 + +bb86: ; preds = %bb85 + ret void +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 4.0.0"} +!1 = !{!2, !2, i64 0} +!2 = 
!{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!6, !6, i64 0} +!6 = !{!"short", !3, i64 0} diff --git a/polly/test/Unit/lit.site.cfg.in b/polly/test/Unit/lit.site.cfg.in --- a/polly/test/Unit/lit.site.cfg.in +++ b/polly/test/Unit/lit.site.cfg.in @@ -11,6 +11,7 @@ config.polly_lib_dir = "@POLLY_LIB_DIR@" config.shlibdir = "@SHLIBDIR@" config.target_triple = "@LLVM_TARGET_TRIPLE@" +config.enable_gpgpu_codegen = "@GPU_CODEGEN@" config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@" config.has_unittests = @POLLY_GTEST_AVAIL@ diff --git a/polly/test/lit.cfg b/polly/test/lit.cfg --- a/polly/test/lit.cfg +++ b/polly/test/lit.cfg @@ -70,4 +70,6 @@ print("Could not find llvm-config in " + config.llvm_tools_dir) exit(42) +if re.search(r'NVPTX', llvm_config_cmd.stdout.read().decode('ascii')): + config.available_features.add('nvptx-registered-target') llvm_config_cmd.wait() diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in --- a/polly/test/lit.site.cfg.in +++ b/polly/test/lit.site.cfg.in @@ -7,6 +7,7 @@ config.polly_obj_root = "@POLLY_BINARY_DIR@" config.polly_lib_dir = "@POLLY_LIB_DIR@" config.target_triple = "@LLVM_TARGET_TRIPLE@" +config.enable_gpgpu_codegen = "@GPU_CODEGEN@" config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@" config.targets_to_build = "@TARGETS_TO_BUILD@" config.extra_paths = "@POLLY_TEST_EXTRA_PATHS@".split(";") @@ -49,6 +50,9 @@ config.substitutions.append(('%loadNPMPolly', commonOpts )) +if config.enable_gpgpu_codegen == 'TRUE' : + config.available_features.add('pollyacc') + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/polly/tools/CMakeLists.txt b/polly/tools/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/polly/tools/CMakeLists.txt @@ -0,0 +1,5 @@ +if (CUDA_FOUND OR OpenCL_FOUND) + add_subdirectory(GPURuntime) +endif (CUDA_FOUND OR OpenCL_FOUND) + +set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) diff --git a/polly/tools/GPURuntime/CMakeLists.txt b/polly/tools/GPURuntime/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/polly/tools/GPURuntime/CMakeLists.txt @@ -0,0 +1,19 @@ +set(MODULE TRUE) +set(LLVM_NO_RTTI 1) + +add_polly_library(GPURuntime + GPUJIT.c + ) + +set_target_properties(GPURuntime + PROPERTIES + LINKER_LANGUAGE C + PREFIX "lib" + ) + +set_property(TARGET GPURuntime PROPERTY C_STANDARD 99) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=default ") +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-sanitize=all ") +endif() diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h new file mode 100644 --- /dev/null +++ b/polly/tools/GPURuntime/GPUJIT.h @@ -0,0 +1,123 @@ +/******************************************************************************/ +/* */ +/* Part of the LLVM Project, under the Apache License v2.0 with LLVM */ +/* Exceptions. */ +/* See https://llvm.org/LICENSE.txt for license information. */ +/* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ +/* */ +/******************************************************************************/ +/* */ +/* This file defines GPUJIT. */ +/* */ +/******************************************************************************/ + +#ifndef GPUJIT_H_ +#define GPUJIT_H_ +#include "stddef.h" + +/* + * The following demonstrates how we can use the GPURuntime library to + * execute a GPU kernel. 
+ *
+ * char KernelString[] = "\n\
+ *   .version 1.4\n\
+ *   .target sm_10, map_f64_to_f32\n\
+ *   .entry _Z8myKernelPi (\n\
+ *   .param .u64 __cudaparm__Z8myKernelPi_data)\n\
+ *   {\n\
+ *   .reg .u16 %rh<4>;\n\
+ *   .reg .u32 %r<5>;\n\
+ *   .reg .u64 %rd<6>;\n\
+ *   cvt.u32.u16 %r1, %tid.x;\n\
+ *   mov.u16 %rh1, %ctaid.x;\n\
+ *   mov.u16 %rh2, %ntid.x;\n\
+ *   mul.wide.u16 %r2, %rh1, %rh2;\n\
+ *   add.u32 %r3, %r1, %r2;\n\
+ *   ld.param.u64 %rd1, [__cudaparm__Z8myKernelPi_data];\n\
+ *   cvt.s64.s32 %rd2, %r3;\n\
+ *   mul.wide.s32 %rd3, %r3, 4;\n\
+ *   add.u64 %rd4, %rd1, %rd3;\n\
+ *   st.global.s32 [%rd4+0], %r3;\n\
+ *   exit;\n\
+ *   }\n\
+ *   ";
+ *
+ * const char *Entry = "_Z8myKernelPi";
+ *
+ * int main() {
+ *   PollyGPUFunction *Kernel;
+ *   PollyGPUContext *Context;
+ *   PollyGPUDevicePtr *DevArray;
+ *   int *HostData;
+ *   int MemSize;
+ *
+ *   int GridX = 8;
+ *   int GridY = 8;
+ *
+ *   int BlockX = 16;
+ *   int BlockY = 16;
+ *   int BlockZ = 1;
+ *
+ *   MemSize = 256*64*sizeof(int);
+ *   Context = polly_initContext();
+ *   DevArray = polly_allocateMemoryForDevice(MemSize);
+ *   Kernel = polly_getKernel(KernelString, Entry);
+ *
+ *   void *Params[1];
+ *   void *DevPtr = polly_getDevicePtr(DevArray);
+ *   Params[0] = &DevPtr;
+ *
+ *   polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params);
+ *
+ *   polly_copyFromDeviceToHost(DevArray, HostData, MemSize);
+ *   polly_freeKernel(Kernel);
+ *   polly_freeDeviceMemory(DevArray);
+ *   polly_freeContext(Context);
+ * }
+ *
+ */
+
+typedef enum PollyGPURuntimeT {
+  RUNTIME_NONE,
+  RUNTIME_CUDA,
+  RUNTIME_CL
+} PollyGPURuntime;
+
+typedef struct PollyGPUContextT PollyGPUContext;
+typedef struct PollyGPUFunctionT PollyGPUFunction;
+typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
+
+typedef struct OpenCLContextT OpenCLContext;
+typedef struct OpenCLKernelT OpenCLKernel;
+typedef struct OpenCLDevicePtrT OpenCLDevicePtr;
+
+typedef struct CUDAContextT CUDAContext;
+typedef struct CUDAKernelT CUDAKernel;
+typedef struct CUDADevicePtrT CUDADevicePtr;
+
+PollyGPUContext *polly_initContextCUDA();
+PollyGPUContext *polly_initContextCL();
+PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
+                                  const char *KernelName);
+void polly_freeKernel(PollyGPUFunction *Kernel);
+void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
+                                long MemSize);
+void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
+                                long MemSize);
+void polly_synchronizeDevice();
+void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
+                        unsigned int GridDimY, unsigned int BlockSizeX,
+                        unsigned int BlockSizeY, unsigned int BlockSizeZ,
+                        void **Parameters);
+void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation);
+void polly_freeContext(PollyGPUContext *Context);
+
+// Note that polly_{malloc/free}Managed are currently not used by Polly.
+// We use them in COSMO by replacing all malloc with polly_mallocManaged and all
+// frees with cudaFree, so we can get managed memory "automatically".
+// Needless to say, this is a hack.
+// Please make sure that this code is not present in Polly when 2018 rolls in.
+// If this is still present, ping Siddharth Bhat
+void *polly_mallocManaged(size_t size);
+void polly_freeManaged(void *mem);
+#endif /* GPUJIT_H_ */
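To make the managed-memory note above concrete, here is a small, hypothetical sketch of how a client application (such as the COSMO use case mentioned in the comment) might route an allocation through these entry points. The helper names are invented for illustration and are not part of GPUJIT or Polly:

#include "GPUJIT.h"
#include <string.h>

/* Illustrative sketch only. Allocate a zero-initialized buffer in managed
 * memory; the returned pointer is intended to be usable both from host code
 * and from Polly-generated kernels, so no explicit
 * polly_copyFromHostToDevice/polly_copyFromDeviceToHost calls are needed
 * for it. */
static float *allocSharedArray(size_t NElems) {
  float *A = (float *)polly_mallocManaged(NElems * sizeof(float));
  memset(A, 0, NElems * sizeof(float));
  return A;
}

/* polly_freeManaged only takes the CUDA free path for pointers it allocated
 * itself and falls back to the normal allocator otherwise, so it is safe to
 * use as a drop-in replacement for free() in such a client. */
static void freeSharedArray(float *A) { polly_freeManaged(A); }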
diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c
new file mode 100644
--- /dev/null
+++ b/polly/tools/GPURuntime/GPUJIT.c
@@ -0,0 +1,1856 @@
+/******************** GPUJIT.c - GPUJIT Execution Engine **********************/
+/*                                                                            */
+/* Part of the LLVM Project, under the Apache License v2.0 with LLVM          */
+/* Exceptions.                                                                */
+/* See https://llvm.org/LICENSE.txt for license information.                  */
+/* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception                    */
+/*                                                                            */
+/******************************************************************************/
+/*                                                                            */
+/* This file implements GPUJIT, a ptx string execution engine for GPU.        */
+/*                                                                            */
+/******************************************************************************/
+
+#include "GPUJIT.h"
+
+#ifdef HAS_LIBCUDART
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif /* HAS_LIBCUDART */
+
+#ifdef HAS_LIBOPENCL
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif /* __APPLE__ */
+#endif /* HAS_LIBOPENCL */
+
+#include <assert.h>
+#include <dlfcn.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static int DebugMode;
+static int CacheMode;
+#define max(x, y) ((x) > (y) ? (x) : (y))
+
+static PollyGPURuntime Runtime = RUNTIME_NONE;
+
+static void debug_print(const char *format, ...) {
+  if (!DebugMode)
+    return;
+
+  va_list args;
+  va_start(args, format);
+  vfprintf(stderr, format, args);
+  va_end(args);
+}
+#define dump_function() debug_print("-> %s\n", __func__)
+
+#define KERNEL_CACHE_SIZE 10
+
+static void err_runtime() __attribute__((noreturn));
+static void err_runtime() {
+  fprintf(stderr, "Runtime not correctly initialized.\n");
+  exit(-1);
+}
+
+struct PollyGPUContextT {
+  void *Context;
+};
+
+struct PollyGPUFunctionT {
+  void *Kernel;
+};
+
+struct PollyGPUDevicePtrT {
+  void *DevicePtr;
+};
+
+/******************************************************************************/
+/*                                  OpenCL                                    */
+/******************************************************************************/
+#ifdef HAS_LIBOPENCL
+
+struct OpenCLContextT {
+  cl_context Context;
+  cl_command_queue CommandQueue;
+};
+
+struct OpenCLKernelT {
+  cl_kernel Kernel;
+  cl_program Program;
+  const char *BinaryString;
+};
+
+struct OpenCLDevicePtrT {
+  cl_mem MemObj;
+};
+
+/* Dynamic library handles for the OpenCL runtime library. */
+static void *HandleOpenCL;
+static void *HandleOpenCLBeignet;
+
+/* Type-defines of function pointer to OpenCL Runtime API. 
*/ +typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, + cl_platform_id *Platforms, + cl_uint *NumPlatforms); +static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; + +typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform, + cl_device_type DeviceType, + cl_uint NumEntries, cl_device_id *Devices, + cl_uint *NumDevices); +static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; + +typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device, + cl_device_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet); +static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; + +typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet); +static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; + +typedef cl_context clCreateContextFcnTy( + const cl_context_properties *Properties, cl_uint NumDevices, + const cl_device_id *Devices, + void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo, + size_t CB, void *UserData), + void *UserData, cl_int *ErrcodeRet); +static clCreateContextFcnTy *clCreateContextFcnPtr; + +typedef cl_command_queue +clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device, + cl_command_queue_properties Properties, + cl_int *ErrcodeRet); +static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; + +typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags, + size_t Size, void *HostPtr, + cl_int *ErrcodeRet); +static clCreateBufferFcnTy *clCreateBufferFcnPtr; + +typedef cl_int +clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer, + cl_bool BlockingWrite, size_t Offset, size_t Size, + const void *Ptr, cl_uint NumEventsInWaitList, + const cl_event *EventWaitList, cl_event *Event); +static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; + +typedef cl_program +clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices, + const cl_device_id *DeviceList, + const char *Filename, cl_int *ErrcodeRet); +static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr; + +typedef cl_program clCreateProgramWithBinaryFcnTy( + cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, + const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, + cl_int *ErrcodeRet); +static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; + +typedef cl_int clBuildProgramFcnTy( + cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList, + const char *Options, + void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData), + void *UserData); +static clBuildProgramFcnTy *clBuildProgramFcnPtr; + +typedef cl_kernel clCreateKernelFcnTy(cl_program Program, + const char *KernelName, + cl_int *ErrcodeRet); +static clCreateKernelFcnTy *clCreateKernelFcnPtr; + +typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex, + size_t ArgSize, const void *ArgValue); +static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; + +typedef cl_int clEnqueueNDRangeKernelFcnTy( + cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, cl_uint NumEventsInWaitList, + const cl_event *EventWaitList, cl_event *Event); +static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; + +typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue, + cl_mem Buffer, cl_bool BlockingRead, + size_t Offset, size_t Size, void *Ptr, + cl_uint 
NumEventsInWaitList, + const cl_event *EventWaitList, + cl_event *Event); +static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr; + +typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue); +static clFlushFcnTy *clFlushFcnPtr; + +typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue); +static clFinishFcnTy *clFinishFcnPtr; + +typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel); +static clReleaseKernelFcnTy *clReleaseKernelFcnPtr; + +typedef cl_int clReleaseProgramFcnTy(cl_program Program); +static clReleaseProgramFcnTy *clReleaseProgramFcnPtr; + +typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject); +static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr; + +typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue); +static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr; + +typedef cl_int clReleaseContextFcnTy(cl_context Context); +static clReleaseContextFcnTy *clReleaseContextFcnPtr; + +static void *getAPIHandleCL(void *Handle, const char *FuncName) { + char *Err; + void *FuncPtr; + dlerror(); + FuncPtr = dlsym(Handle, FuncName); + if ((Err = dlerror()) != 0) { + fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err); + return 0; + } + return FuncPtr; +} + +static int initialDeviceAPILibrariesCL() { + HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY); + HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); + if (!HandleOpenCL) { + fprintf(stderr, "Cannot open library: %s. \n", dlerror()); + return 0; + } + return 1; +} + +/* Get function pointer to OpenCL Runtime API. + * + * Note that compilers conforming to the ISO C standard are required to + * generate a warning if a conversion from a void * pointer to a function + * pointer is attempted as in the following statements. The warning + * of this kind of cast may not be emitted by clang and new versions of gcc + * as it is valid on POSIX 2008. For compilers required to generate a warning, + * we temporarily disable -Wpedantic, to avoid bloating the output with + * unnecessary warnings. + * + * Reference: + * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +static int initialDeviceAPIsCL() { + if (initialDeviceAPILibrariesCL() == 0) + return 0; + + // FIXME: We are now always selecting the Intel Beignet driver if it is + // available on the system, instead of a possible NVIDIA or AMD OpenCL + // API. This selection should occurr based on the target architecture + // chosen when compiling. + void *Handle = + (HandleOpenCLBeignet != NULL ? 
HandleOpenCLBeignet : HandleOpenCL); + + clGetPlatformIDsFcnPtr = + (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs"); + + clGetDeviceIDsFcnPtr = + (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs"); + + clGetDeviceInfoFcnPtr = + (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo"); + + clGetKernelInfoFcnPtr = + (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo"); + + clCreateContextFcnPtr = + (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext"); + + clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( + Handle, "clCreateCommandQueue"); + + clCreateBufferFcnPtr = + (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer"); + + clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( + Handle, "clEnqueueWriteBuffer"); + + if (HandleOpenCLBeignet) + clCreateProgramWithLLVMIntelFcnPtr = + (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL( + Handle, "clCreateProgramWithLLVMIntel"); + + clCreateProgramWithBinaryFcnPtr = + (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( + Handle, "clCreateProgramWithBinary"); + + clBuildProgramFcnPtr = + (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram"); + + clCreateKernelFcnPtr = + (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel"); + + clSetKernelArgFcnPtr = + (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg"); + + clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( + Handle, "clEnqueueNDRangeKernel"); + + clEnqueueReadBufferFcnPtr = + (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer"); + + clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush"); + + clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish"); + + clReleaseKernelFcnPtr = + (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel"); + + clReleaseProgramFcnPtr = + (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram"); + + clReleaseMemObjectFcnPtr = + (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject"); + + clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( + Handle, "clReleaseCommandQueue"); + + clReleaseContextFcnPtr = + (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext"); + + return 1; +} +#pragma GCC diagnostic pop + +/* Context and Device. */ +static PollyGPUContext *GlobalContext = NULL; +static cl_device_id GlobalDeviceID = NULL; + +/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ +static void printOpenCLError(int Error); + +static void checkOpenCLError(int Ret, const char *format, ...) { + if (Ret == CL_SUCCESS) + return; + + printOpenCLError(Ret); + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + exit(-1); +} + +static PollyGPUContext *initContextCL() { + dump_function(); + + PollyGPUContext *Context; + + cl_platform_id PlatformID = NULL; + cl_device_id DeviceID = NULL; + cl_uint NumDevicesRet; + cl_int Ret; + + char DeviceRevision[256]; + char DeviceName[256]; + size_t DeviceRevisionRetSize, DeviceNameRetSize; + + static __thread PollyGPUContext *CurrentContext = NULL; + + if (CurrentContext) + return CurrentContext; + + /* Get API handles. */ + if (initialDeviceAPIsCL() == 0) { + fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); + exit(-1); + } + + /* Get number of devices that support OpenCL. 
*/ + static const int NumberOfPlatforms = 1; + Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL); + checkOpenCLError(Ret, "Failed to get platform IDs.\n"); + // TODO: Extend to CL_DEVICE_TYPE_ALL? + static const int NumberOfDevices = 1; + Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices, + &DeviceID, &NumDevicesRet); + checkOpenCLError(Ret, "Failed to get device IDs.\n"); + + GlobalDeviceID = DeviceID; + if (NumDevicesRet == 0) { + fprintf(stderr, "There is no device supporting OpenCL.\n"); + exit(-1); + } + + /* Get device revision. */ + Ret = + clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision), + DeviceRevision, &DeviceRevisionRetSize); + checkOpenCLError(Ret, "Failed to fetch device revision.\n"); + + /* Get device name. */ + Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName), + DeviceName, &DeviceNameRetSize); + checkOpenCLError(Ret, "Failed to fetch device name.\n"); + + debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); + + /* Create context on the device. */ + Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); + if (Context == 0) { + fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); + exit(-1); + } + Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext)); + if (Context->Context == 0) { + fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n"); + exit(-1); + } + ((OpenCLContext *)Context->Context)->Context = + clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret); + checkOpenCLError(Ret, "Failed to create context.\n"); + + static const int ExtraProperties = 0; + ((OpenCLContext *)Context->Context)->CommandQueue = + clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context, + DeviceID, ExtraProperties, &Ret); + checkOpenCLError(Ret, "Failed to create command queue.\n"); + + if (CacheMode) + CurrentContext = Context; + + GlobalContext = Context; + return Context; +} + +static void freeKernelCL(PollyGPUFunction *Kernel) { + dump_function(); + + if (CacheMode) + return; + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int Ret; + Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); + checkOpenCLError(Ret, "Failed to flush command queue.\n"); + Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); + checkOpenCLError(Ret, "Failed to finish command queue.\n"); + + if (((OpenCLKernel *)Kernel->Kernel)->Kernel) { + cl_int Ret = + clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel); + checkOpenCLError(Ret, "Failed to release kernel.\n"); + } + + if (((OpenCLKernel *)Kernel->Kernel)->Program) { + cl_int Ret = + clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program); + checkOpenCLError(Ret, "Failed to release program.\n"); + } + + if (Kernel->Kernel) + free((OpenCLKernel *)Kernel->Kernel); + + if (Kernel) + free(Kernel); +} + +static PollyGPUFunction *getKernelCL(const char *BinaryBuffer, + const char *KernelName) { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; + static __thread int NextCacheItem = 0; + + for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { + // We exploit here the property that all Polly-ACC kernels are allocated + // as global constants, hence a pointer comparision is sufficient to + // 
determin equality. + if (KernelCache[i] && + ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString == + BinaryBuffer) { + debug_print(" -> using cached kernel\n"); + return KernelCache[i]; + } + } + + PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); + if (Function == 0) { + fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); + exit(-1); + } + Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel)); + if (Function->Kernel == 0) { + fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n"); + exit(-1); + } + + if (!GlobalDeviceID) { + fprintf(stderr, "GPGPU-code generation not initialized correctly.\n"); + exit(-1); + } + + cl_int Ret; + + if (HandleOpenCLBeignet) { + // This is a workaround, since clCreateProgramWithLLVMIntel only + // accepts a filename to a valid llvm-ir file as an argument, instead + // of accepting the BinaryBuffer directly. + char FileName[] = "/tmp/polly_kernelXXXXXX"; + int File = mkstemp(FileName); + write(File, BinaryBuffer, strlen(BinaryBuffer)); + + ((OpenCLKernel *)Function->Kernel)->Program = + clCreateProgramWithLLVMIntelFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->Context, 1, + &GlobalDeviceID, FileName, &Ret); + checkOpenCLError(Ret, "Failed to create program from llvm.\n"); + close(File); + unlink(FileName); + } else { + size_t BinarySize = strlen(BinaryBuffer); + ((OpenCLKernel *)Function->Kernel)->Program = + clCreateProgramWithBinaryFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->Context, 1, + &GlobalDeviceID, (const size_t *)&BinarySize, + (const unsigned char **)&BinaryBuffer, NULL, &Ret); + checkOpenCLError(Ret, "Failed to create program from binary.\n"); + } + + Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1, + &GlobalDeviceID, NULL, NULL, NULL); + checkOpenCLError(Ret, "Failed to build program.\n"); + + ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr( + ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret); + checkOpenCLError(Ret, "Failed to create kernel.\n"); + + ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer; + + if (CacheMode) { + if (KernelCache[NextCacheItem]) + freeKernelCL(KernelCache[NextCacheItem]); + + KernelCache[NextCacheItem] = Function; + + NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; + } + + return Function; +} + +static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int Ret; + Ret = clEnqueueWriteBufferFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->CommandQueue, + ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, + HostData, 0, NULL, NULL); + checkOpenCLError(Ret, "Copying data from host memory to device failed.\n"); +} + +static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int Ret; + Ret = clEnqueueReadBufferFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->CommandQueue, + ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, + HostData, 0, NULL, NULL); + checkOpenCLError(Ret, "Copying results from device to host memory failed.\n"); +} + +static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int 
BlockDimY, unsigned int BlockDimZ, + void **Parameters) { + dump_function(); + + cl_int Ret; + cl_uint NumArgs; + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel; + Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS, + sizeof(cl_uint), &NumArgs, NULL); + checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n"); + + /* Argument sizes are stored at the end of the Parameters array. */ + for (cl_uint i = 0; i < NumArgs; i++) { + Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, + *((int *)Parameters[NumArgs + i]), + (void *)Parameters[i]); + checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); + } + + unsigned int GridDimZ = 1; + size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY, + BlockDimZ * GridDimZ}; + size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ}; + + static const int WorkDim = 3; + OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; + Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel, + WorkDim, NULL, GlobalWorkSize, + LocalWorkSize, 0, NULL, NULL); + checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n"); +} + +static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) { + dump_function(); + + OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; + cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj); + checkOpenCLError(Ret, "Failed to free device memory.\n"); + + free(DevPtr); + free(Allocation); +} + +static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); + if (DevData == 0) { + fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr)); + if (DevData->DevicePtr == 0) { + fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + + cl_int Ret; + ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj = + clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context, + CL_MEM_READ_WRITE, MemSize, NULL, &Ret); + checkOpenCLError(Ret, + "Allocate memory for GPU device memory pointer failed.\n"); + + return DevData; +} + +static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) { + dump_function(); + + OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; + return (void *)DevPtr->MemObj; +} + +static void synchronizeDeviceCL() { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) != + CL_SUCCESS) { + fprintf(stderr, "Synchronizing device and host memory failed.\n"); + exit(-1); + } +} + +static void freeContextCL(PollyGPUContext *Context) { + dump_function(); + + cl_int Ret; + + GlobalContext = NULL; + + OpenCLContext *Ctx = (OpenCLContext *)Context->Context; + if (Ctx->CommandQueue) { + Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue); + checkOpenCLError(Ret, "Could not release command queue.\n"); + } + + if (Ctx->Context) { + Ret = clReleaseContextFcnPtr(Ctx->Context); + checkOpenCLError(Ret, "Could not release context.\n"); + } + + free(Ctx); + free(Context); +} + +static void 
printOpenCLError(int Error) { + + switch (Error) { + case CL_SUCCESS: + // Success, don't print an error. + break; + + // JIT/Runtime errors. + case CL_DEVICE_NOT_FOUND: + fprintf(stderr, "Device not found.\n"); + break; + case CL_DEVICE_NOT_AVAILABLE: + fprintf(stderr, "Device not available.\n"); + break; + case CL_COMPILER_NOT_AVAILABLE: + fprintf(stderr, "Compiler not available.\n"); + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + fprintf(stderr, "Mem object allocation failure.\n"); + break; + case CL_OUT_OF_RESOURCES: + fprintf(stderr, "Out of resources.\n"); + break; + case CL_OUT_OF_HOST_MEMORY: + fprintf(stderr, "Out of host memory.\n"); + break; + case CL_PROFILING_INFO_NOT_AVAILABLE: + fprintf(stderr, "Profiling info not available.\n"); + break; + case CL_MEM_COPY_OVERLAP: + fprintf(stderr, "Mem copy overlap.\n"); + break; + case CL_IMAGE_FORMAT_MISMATCH: + fprintf(stderr, "Image format mismatch.\n"); + break; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + fprintf(stderr, "Image format not supported.\n"); + break; + case CL_BUILD_PROGRAM_FAILURE: + fprintf(stderr, "Build program failure.\n"); + break; + case CL_MAP_FAILURE: + fprintf(stderr, "Map failure.\n"); + break; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + fprintf(stderr, "Misaligned sub buffer offset.\n"); + break; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + fprintf(stderr, "Exec status error for events in wait list.\n"); + break; + case CL_COMPILE_PROGRAM_FAILURE: + fprintf(stderr, "Compile program failure.\n"); + break; + case CL_LINKER_NOT_AVAILABLE: + fprintf(stderr, "Linker not available.\n"); + break; + case CL_LINK_PROGRAM_FAILURE: + fprintf(stderr, "Link program failure.\n"); + break; + case CL_DEVICE_PARTITION_FAILED: + fprintf(stderr, "Device partition failed.\n"); + break; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + fprintf(stderr, "Kernel arg info not available.\n"); + break; + + // Compiler errors. 
+ case CL_INVALID_VALUE: + fprintf(stderr, "Invalid value.\n"); + break; + case CL_INVALID_DEVICE_TYPE: + fprintf(stderr, "Invalid device type.\n"); + break; + case CL_INVALID_PLATFORM: + fprintf(stderr, "Invalid platform.\n"); + break; + case CL_INVALID_DEVICE: + fprintf(stderr, "Invalid device.\n"); + break; + case CL_INVALID_CONTEXT: + fprintf(stderr, "Invalid context.\n"); + break; + case CL_INVALID_QUEUE_PROPERTIES: + fprintf(stderr, "Invalid queue properties.\n"); + break; + case CL_INVALID_COMMAND_QUEUE: + fprintf(stderr, "Invalid command queue.\n"); + break; + case CL_INVALID_HOST_PTR: + fprintf(stderr, "Invalid host pointer.\n"); + break; + case CL_INVALID_MEM_OBJECT: + fprintf(stderr, "Invalid memory object.\n"); + break; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + fprintf(stderr, "Invalid image format descriptor.\n"); + break; + case CL_INVALID_IMAGE_SIZE: + fprintf(stderr, "Invalid image size.\n"); + break; + case CL_INVALID_SAMPLER: + fprintf(stderr, "Invalid sampler.\n"); + break; + case CL_INVALID_BINARY: + fprintf(stderr, "Invalid binary.\n"); + break; + case CL_INVALID_BUILD_OPTIONS: + fprintf(stderr, "Invalid build options.\n"); + break; + case CL_INVALID_PROGRAM: + fprintf(stderr, "Invalid program.\n"); + break; + case CL_INVALID_PROGRAM_EXECUTABLE: + fprintf(stderr, "Invalid program executable.\n"); + break; + case CL_INVALID_KERNEL_NAME: + fprintf(stderr, "Invalid kernel name.\n"); + break; + case CL_INVALID_KERNEL_DEFINITION: + fprintf(stderr, "Invalid kernel definition.\n"); + break; + case CL_INVALID_KERNEL: + fprintf(stderr, "Invalid kernel.\n"); + break; + case CL_INVALID_ARG_INDEX: + fprintf(stderr, "Invalid arg index.\n"); + break; + case CL_INVALID_ARG_VALUE: + fprintf(stderr, "Invalid arg value.\n"); + break; + case CL_INVALID_ARG_SIZE: + fprintf(stderr, "Invalid arg size.\n"); + break; + case CL_INVALID_KERNEL_ARGS: + fprintf(stderr, "Invalid kernel args.\n"); + break; + case CL_INVALID_WORK_DIMENSION: + fprintf(stderr, "Invalid work dimension.\n"); + break; + case CL_INVALID_WORK_GROUP_SIZE: + fprintf(stderr, "Invalid work group size.\n"); + break; + case CL_INVALID_WORK_ITEM_SIZE: + fprintf(stderr, "Invalid work item size.\n"); + break; + case CL_INVALID_GLOBAL_OFFSET: + fprintf(stderr, "Invalid global offset.\n"); + break; + case CL_INVALID_EVENT_WAIT_LIST: + fprintf(stderr, "Invalid event wait list.\n"); + break; + case CL_INVALID_EVENT: + fprintf(stderr, "Invalid event.\n"); + break; + case CL_INVALID_OPERATION: + fprintf(stderr, "Invalid operation.\n"); + break; + case CL_INVALID_GL_OBJECT: + fprintf(stderr, "Invalid GL object.\n"); + break; + case CL_INVALID_BUFFER_SIZE: + fprintf(stderr, "Invalid buffer size.\n"); + break; + case CL_INVALID_MIP_LEVEL: + fprintf(stderr, "Invalid mip level.\n"); + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + fprintf(stderr, "Invalid global work size.\n"); + break; + case CL_INVALID_PROPERTY: + fprintf(stderr, "Invalid property.\n"); + break; + case CL_INVALID_IMAGE_DESCRIPTOR: + fprintf(stderr, "Invalid image descriptor.\n"); + break; + case CL_INVALID_COMPILER_OPTIONS: + fprintf(stderr, "Invalid compiler options.\n"); + break; + case CL_INVALID_LINKER_OPTIONS: + fprintf(stderr, "Invalid linker options.\n"); + break; + case CL_INVALID_DEVICE_PARTITION_COUNT: + fprintf(stderr, "Invalid device partition count.\n"); + break; + case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE + fprintf(stderr, "Invalid pipe size.\n"); + break; + case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE + fprintf(stderr, "Invalid device 
queue.\n"); + break; + + // NVIDIA specific error. + case -9999: + fprintf(stderr, "NVIDIA invalid read or write buffer.\n"); + break; + + default: + fprintf(stderr, "Unknown error code!\n"); + break; + } +} + +#endif /* HAS_LIBOPENCL */ +/******************************************************************************/ +/* CUDA */ +/******************************************************************************/ +#ifdef HAS_LIBCUDART + +struct CUDAContextT { + CUcontext Cuda; +}; + +struct CUDAKernelT { + CUfunction Cuda; + CUmodule CudaModule; + const char *BinaryString; +}; + +struct CUDADevicePtrT { + CUdeviceptr Cuda; +}; + +/* Dynamic library handles for the CUDA and CUDA runtime library. */ +static void *HandleCuda; +static void *HandleCudaRT; + +/* Type-defines of function pointer to CUDA driver APIs. */ +typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); +static CuMemAllocFcnTy *CuMemAllocFcnPtr; + +typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t, + unsigned int); +static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr; + +typedef CUresult CUDAAPI CuLaunchKernelFcnTy( + CUfunction F, unsigned int GridDimX, unsigned int GridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY, + unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream, + void **KernelParams, void **Extra); +static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; + +typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); +static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr; + +typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t); +static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr; + +typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr); +static CuMemFreeFcnTy *CuMemFreeFcnPtr; + +typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule); +static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr; + +typedef CUresult CUDAAPI CuProfilerStopFcnTy(); +static CuProfilerStopFcnTy *CuProfilerStopFcnPtr; + +typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext); +static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr; + +typedef CUresult CUDAAPI CuInitFcnTy(unsigned int); +static CuInitFcnTy *CuInitFcnPtr; + +typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *); +static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr; + +typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice); +static CuCtxCreateFcnTy *CuCtxCreateFcnPtr; + +typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *); +static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr; + +typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int); +static CuDeviceGetFcnTy *CuDeviceGetFcnPtr; + +typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *, + unsigned int, CUjit_option *, + void **); +static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; + +typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module, + const void *Image); +static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr; + +typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, + const char *); +static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr; + +typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice); +static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr; + +typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); +static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; + +typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State, + CUjitInputType Type, void *Data, + size_t Size, const char *Name, + unsigned 
int NumOptions, + CUjit_option *Options, + void **OptionValues); +static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr; + +typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions, + CUjit_option *Options, + void **OptionValues, + CUlinkState *StateOut); +static CuLinkCreateFcnTy *CuLinkCreateFcnPtr; + +typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut, + size_t *SizeOut); +static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr; + +typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State); +static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr; + +typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy(); +static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr; + +/* Type-defines of function pointer ot CUDA runtime APIs. */ +typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); +static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; + +static void *getAPIHandleCUDA(void *Handle, const char *FuncName) { + char *Err; + void *FuncPtr; + dlerror(); + FuncPtr = dlsym(Handle, FuncName); + if ((Err = dlerror()) != 0) { + fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err); + return 0; + } + return FuncPtr; +} + +static int initialDeviceAPILibrariesCUDA() { + HandleCuda = dlopen("libcuda.so", RTLD_LAZY); + if (!HandleCuda) { + fprintf(stderr, "Cannot open library: %s. \n", dlerror()); + return 0; + } + + HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY); + if (!HandleCudaRT) { + fprintf(stderr, "Cannot open library: %s. \n", dlerror()); + return 0; + } + + return 1; +} + +/* Get function pointer to CUDA Driver APIs. + * + * Note that compilers conforming to the ISO C standard are required to + * generate a warning if a conversion from a void * pointer to a function + * pointer is attempted as in the following statements. The warning + * of this kind of cast may not be emitted by clang and new versions of gcc + * as it is valid on POSIX 2008. For compilers required to generate a warning, + * we temporarily disable -Wpedantic, to avoid bloating the output with + * unnecessary warnings. 
+ * + * Reference: + * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +static int initialDeviceAPIsCUDA() { + if (initialDeviceAPILibrariesCUDA() == 0) + return 0; + + CuLaunchKernelFcnPtr = + (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel"); + + CuMemAllocFcnPtr = + (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2"); + + CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA( + HandleCuda, "cuMemAllocManaged"); + + CuMemFreeFcnPtr = + (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2"); + + CuMemcpyDtoHFcnPtr = + (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2"); + + CuMemcpyHtoDFcnPtr = + (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2"); + + CuModuleUnloadFcnPtr = + (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload"); + + CuProfilerStopFcnPtr = + (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop"); + + CuCtxDestroyFcnPtr = + (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy"); + + CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit"); + + CuDeviceGetCountFcnPtr = + (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount"); + + CuDeviceGetFcnPtr = + (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet"); + + CuCtxCreateFcnPtr = + (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2"); + + CuCtxGetCurrentFcnPtr = + (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent"); + + CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA( + HandleCuda, "cuModuleLoadDataEx"); + + CuModuleLoadDataFcnPtr = + (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData"); + + CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA( + HandleCuda, "cuModuleGetFunction"); + + CuDeviceComputeCapabilityFcnPtr = + (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA( + HandleCuda, "cuDeviceComputeCapability"); + + CuDeviceGetNameFcnPtr = + (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName"); + + CuLinkAddDataFcnPtr = + (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData"); + + CuLinkCreateFcnPtr = + (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate"); + + CuLinkCompleteFcnPtr = + (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete"); + + CuLinkDestroyFcnPtr = + (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy"); + + CuCtxSynchronizeFcnPtr = + (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize"); + + /* Get function pointer to CUDA Runtime APIs. */ + CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA( + HandleCudaRT, "cudaThreadSynchronize"); + + return 1; +} +#pragma GCC diagnostic pop + +static PollyGPUContext *initContextCUDA() { + dump_function(); + PollyGPUContext *Context; + CUdevice Device; + + int Major = 0, Minor = 0, DeviceID = 0; + char DeviceName[256]; + int DeviceCount = 0; + + static __thread PollyGPUContext *CurrentContext = NULL; + + if (CurrentContext) + return CurrentContext; + + /* Get API handles. */ + if (initialDeviceAPIsCUDA() == 0) { + fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n"); + exit(-1); + } + + if (CuInitFcnPtr(0) != CUDA_SUCCESS) { + fprintf(stderr, "Initializing the CUDA driver API failed.\n"); + exit(-1); + } + + /* Get number of devices that supports CUDA. 
*/ + CuDeviceGetCountFcnPtr(&DeviceCount); + if (DeviceCount == 0) { + fprintf(stderr, "There is no device supporting CUDA.\n"); + exit(-1); + } + + CuDeviceGetFcnPtr(&Device, 0); + + /* Get compute capabilities and the device name. */ + CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device); + CuDeviceGetNameFcnPtr(DeviceName, 256, Device); + debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); + + /* Create context on the device. */ + Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); + if (Context == 0) { + fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); + exit(-1); + } + Context->Context = malloc(sizeof(CUDAContext)); + if (Context->Context == 0) { + fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n"); + exit(-1); + } + + // In cases where managed memory is used, it is quite likely that + // `cudaMallocManaged` / `polly_mallocManaged` was called before + // `polly_initContext` was called. + // + // If `polly_initContext` calls `CuCtxCreate` when there already was a + // pre-existing context created by the runtime API, this causes code running + // on P100 to hang. So, we query for a pre-existing context to try and use. + // If there is no pre-existing context, we create a new context + + // The possible pre-existing context from previous runtime API calls. + CUcontext MaybeRuntimeAPIContext; + if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) { + fprintf(stderr, "cuCtxGetCurrent failed.\n"); + exit(-1); + } + + // There was no previous context, initialise it. + if (MaybeRuntimeAPIContext == NULL) { + if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, + Device) != CUDA_SUCCESS) { + fprintf(stderr, "cuCtxCreateFcnPtr failed.\n"); + exit(-1); + } + } else { + ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext; + } + + if (CacheMode) + CurrentContext = Context; + + return Context; +} + +static void freeKernelCUDA(PollyGPUFunction *Kernel) { + dump_function(); + + if (CacheMode) + return; + + if (((CUDAKernel *)Kernel->Kernel)->CudaModule) + CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule); + + if (Kernel->Kernel) + free((CUDAKernel *)Kernel->Kernel); + + if (Kernel) + free(Kernel); +} + +static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer, + const char *KernelName) { + dump_function(); + + static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; + static __thread int NextCacheItem = 0; + + for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { + // We exploit here the property that all Polly-ACC kernels are allocated + // as global constants, hence a pointer comparision is sufficient to + // determin equality. 
+ if (KernelCache[i] && + ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) { + debug_print(" -> using cached kernel\n"); + return KernelCache[i]; + } + } + + PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); + if (Function == 0) { + fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); + exit(-1); + } + Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel)); + if (Function->Kernel == 0) { + fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n"); + exit(-1); + } + + CUresult Res; + CUlinkState LState; + CUjit_option Options[6]; + void *OptionVals[6]; + float Walltime = 0; + unsigned long LogSize = 8192; + char ErrorLog[8192], InfoLog[8192]; + void *CuOut; + size_t OutSize; + + // Setup linker options + // Return walltime from JIT compilation + Options[0] = CU_JIT_WALL_TIME; + OptionVals[0] = (void *)&Walltime; + // Pass a buffer for info messages + Options[1] = CU_JIT_INFO_LOG_BUFFER; + OptionVals[1] = (void *)InfoLog; + // Pass the size of the info buffer + Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + OptionVals[2] = (void *)LogSize; + // Pass a buffer for error message + Options[3] = CU_JIT_ERROR_LOG_BUFFER; + OptionVals[3] = (void *)ErrorLog; + // Pass the size of the error buffer + Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + OptionVals[4] = (void *)LogSize; + // Make the linker verbose + Options[5] = CU_JIT_LOG_VERBOSE; + OptionVals[5] = (void *)1; + + memset(ErrorLog, 0, sizeof(ErrorLog)); + + CuLinkCreateFcnPtr(6, Options, OptionVals, &LState); + Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer, + strlen(BinaryBuffer) + 1, 0, 0, 0, 0); + if (Res != CUDA_SUCCESS) { + fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); + exit(-1); + } + + Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize); + if (Res != CUDA_SUCCESS) { + fprintf(stderr, "Complete ptx linker step failed.\n"); + fprintf(stderr, "\n%s\n", ErrorLog); + exit(-1); + } + + debug_print("CUDA Link Completed in %fms. 
Linker Output:\n%s\n", Walltime, + InfoLog); + + Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule), + CuOut); + if (Res != CUDA_SUCCESS) { + fprintf(stderr, "Loading ptx assembly text failed.\n"); + exit(-1); + } + + Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda), + ((CUDAKernel *)Function->Kernel)->CudaModule, + KernelName); + if (Res != CUDA_SUCCESS) { + fprintf(stderr, "Loading kernel function failed.\n"); + exit(-1); + } + + CuLinkDestroyFcnPtr(LState); + + ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer; + + if (CacheMode) { + if (KernelCache[NextCacheItem]) + freeKernelCUDA(KernelCache[NextCacheItem]); + + KernelCache[NextCacheItem] = Function; + + NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; + } + + return Function; +} + +static void synchronizeDeviceCUDA() { + dump_function(); + if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) { + fprintf(stderr, "Synchronizing device and host memory failed.\n"); + exit(-1); + } +} + +static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { + dump_function(); + + CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda; + CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); +} + +static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { + dump_function(); + + if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda, + MemSize) != CUDA_SUCCESS) { + fprintf(stderr, "Copying results from device to host memory failed.\n"); + exit(-1); + } +} + +static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { + dump_function(); + + unsigned GridDimZ = 1; + unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE; + CUstream Stream = 0; + void **Extra = 0; + + CUresult Res; + Res = + CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX, + GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ, + SharedMemBytes, Stream, Parameters, Extra); + if (Res != CUDA_SUCCESS) { + fprintf(stderr, "Launching CUDA kernel failed.\n"); + exit(-1); + } +} + +// Maximum number of managed memory pointers. +#define DEFAULT_MAX_POINTERS 4000 +// For the rationale behing a list of free pointers, see `polly_freeManaged`. +void **g_managedptrs; +unsigned long long g_nmanagedptrs = 0; +unsigned long long g_maxmanagedptrs = 0; + +__attribute__((constructor)) static void initManagedPtrsBuffer() { + g_maxmanagedptrs = DEFAULT_MAX_POINTERS; + const char *maxManagedPointersString = getenv("POLLY_MAX_MANAGED_POINTERS"); + if (maxManagedPointersString) + g_maxmanagedptrs = atoll(maxManagedPointersString); + + g_managedptrs = (void **)malloc(sizeof(void *) * g_maxmanagedptrs); +} + +// Add a pointer as being allocated by cuMallocManaged +void addManagedPtr(void *mem) { + assert(g_maxmanagedptrs > 0 && "g_maxmanagedptrs was set to 0!"); + assert(g_nmanagedptrs < g_maxmanagedptrs && + "We have hit the maximum number of " + "managed pointers allowed. Set the " + "POLLY_MAX_MANAGED_POINTERS environment variable. 
"); + g_managedptrs[g_nmanagedptrs++] = mem; +} + +int isManagedPtr(void *mem) { + for (unsigned long long i = 0; i < g_nmanagedptrs; i++) { + if (g_managedptrs[i] == mem) + return 1; + } + return 0; +} + +void freeManagedCUDA(void *mem) { + dump_function(); + + // In a real-world program this was used (COSMO), there were more `free` + // calls in the original source than `malloc` calls. Hence, replacing all + // `free`s with `cudaFree` does not work, since we would try to free + // 'illegal' memory. + // As a quick fix, we keep a free list and check if `mem` is a managed memory + // pointer. If it is, we call `cudaFree`. + // If not, we pass it along to the underlying allocator. + // This is a hack, and can be removed if the underlying issue is fixed. + if (isManagedPtr(mem)) { + if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) { + fprintf(stderr, "cudaFree failed.\n"); + exit(-1); + } + return; + } else { + free(mem); + } +} + +void *mallocManagedCUDA(size_t size) { + // Note: [Size 0 allocations] + // Sometimes, some runtime computation of size could create a size of 0 + // for an allocation. In these cases, we do not wish to fail. + // The CUDA API fails on size 0 allocations. + // So, we allocate size a minimum of size 1. + if (!size && DebugMode) + fprintf(stderr, "cudaMallocManaged called with size 0. " + "Promoting to size 1"); + size = max(size, 1); + PollyGPUContext *_ = polly_initContextCUDA(); + assert(_ && "polly_initContextCUDA failed"); + + void *newMemPtr; + const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size, + CU_MEM_ATTACH_GLOBAL); + if (Res != CUDA_SUCCESS) { + fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size); + exit(-1); + } + addManagedPtr(newMemPtr); + return newMemPtr; +} + +static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { + dump_function(); + CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; + CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda); + free(DevPtr); + free(Allocation); +} + +static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) { + if (!MemSize && DebugMode) + fprintf(stderr, "allocateMemoryForDeviceCUDA called with size 0. " + "Promoting to size 1"); + // see: [Size 0 allocations] + MemSize = max(MemSize, 1); + dump_function(); + + PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); + if (DevData == 0) { + fprintf(stderr, + "Allocate memory for GPU device memory pointer failed." + " Line: %d | Size: %ld\n", + __LINE__, MemSize); + exit(-1); + } + DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr)); + if (DevData->DevicePtr == 0) { + fprintf(stderr, + "Allocate memory for GPU device memory pointer failed." + " Line: %d | Size: %ld\n", + __LINE__, MemSize); + exit(-1); + } + + CUresult Res = + CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize); + + if (Res != CUDA_SUCCESS) { + fprintf(stderr, + "Allocate memory for GPU device memory pointer failed." 
+ " Line: %d | Size: %ld\n", + __LINE__, MemSize); + exit(-1); + } + + return DevData; +} + +static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) { + dump_function(); + + CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; + return (void *)DevPtr->Cuda; +} + +static void freeContextCUDA(PollyGPUContext *Context) { + dump_function(); + + CUDAContext *Ctx = (CUDAContext *)Context->Context; + if (Ctx->Cuda) { + CuProfilerStopFcnPtr(); + CuCtxDestroyFcnPtr(Ctx->Cuda); + free(Ctx); + free(Context); + } + + dlclose(HandleCuda); + dlclose(HandleCudaRT); +} + +#endif /* HAS_LIBCUDART */ +/******************************************************************************/ +/* API */ +/******************************************************************************/ + +PollyGPUContext *polly_initContext() { + DebugMode = getenv("POLLY_DEBUG") != 0; + CacheMode = getenv("POLLY_NOCACHE") == 0; + + dump_function(); + + PollyGPUContext *Context; + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + Context = initContextCUDA(); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + Context = initContextCL(); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } + + return Context; +} + +void polly_freeKernel(PollyGPUFunction *Kernel) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + freeKernelCUDA(Kernel); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + freeKernelCL(Kernel); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, + const char *KernelName) { + dump_function(); + + PollyGPUFunction *Function; + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + Function = getKernelCUDA(BinaryBuffer, KernelName); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + Function = getKernelCL(BinaryBuffer, KernelName); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } + + return Function; +} + +void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + copyFromHostToDeviceCUDA(HostData, DevData, MemSize); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + copyFromHostToDeviceCL(HostData, DevData, MemSize); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + copyFromDeviceToHostCUDA(DevData, HostData, MemSize); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + copyFromDeviceToHostCL(DevData, HostData, MemSize); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, + BlockDimZ, Parameters); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, + Parameters); + break; +#endif /* HAS_LIBOPENCL */ + 
default: + err_runtime(); + } +} + +void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + freeDeviceMemoryCUDA(Allocation); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + freeDeviceMemoryCL(Allocation); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { + dump_function(); + + PollyGPUDevicePtr *DevData; + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + DevData = allocateMemoryForDeviceCUDA(MemSize); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + DevData = allocateMemoryForDeviceCL(MemSize); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } + + return DevData; +} + +void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { + dump_function(); + + void *DevPtr; + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + DevPtr = getDevicePtrCUDA(Allocation); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + DevPtr = getDevicePtrCL(Allocation); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } + + return DevPtr; +} + +void polly_synchronizeDevice() { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + synchronizeDeviceCUDA(); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + synchronizeDeviceCL(); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +void polly_freeContext(PollyGPUContext *Context) { + dump_function(); + + if (CacheMode) + return; + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + freeContextCUDA(Context); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + freeContextCL(Context); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +void polly_freeManaged(void *mem) { + dump_function(); + +#ifdef HAS_LIBCUDART + freeManagedCUDA(mem); +#else + fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n"); + exit(-1); +#endif +} + +void *polly_mallocManaged(size_t size) { + dump_function(); + +#ifdef HAS_LIBCUDART + return mallocManagedCUDA(size); +#else + fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n"); + exit(-1); +#endif +} + +/* Initialize GPUJIT with CUDA as runtime library. */ +PollyGPUContext *polly_initContextCUDA() { +#ifdef HAS_LIBCUDART + Runtime = RUNTIME_CUDA; + return polly_initContext(); +#else + fprintf(stderr, "GPU Runtime was built without CUDA support.\n"); + exit(-1); +#endif /* HAS_LIBCUDART */ +} + +/* Initialize GPUJIT with OpenCL as runtime library. 
*/ +PollyGPUContext *polly_initContextCL() { +#ifdef HAS_LIBOPENCL + Runtime = RUNTIME_CL; + return polly_initContext(); +#else + fprintf(stderr, "GPU Runtime was built without OpenCL support.\n"); + exit(-1); +#endif /* HAS_LIBOPENCL */ +} diff --git a/polly/tools/GPURuntime/LICENSE.TXT b/polly/tools/GPURuntime/LICENSE.TXT new file mode 100644 --- /dev/null +++ b/polly/tools/GPURuntime/LICENSE.TXT @@ -0,0 +1,310 @@ +============================================================================== +The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: +============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +---- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + +============================================================================== +Software from third parties included in the LLVM Project: +============================================================================== +The LLVM Project contains third party software which is under different license +terms. All such code will be identified clearly using at least one of two +mechanisms: +1) It will be in a separate directory tree with its own `LICENSE.txt` or + `LICENSE` file at the top containing the specific license and restrictions + which apply to that software, or +2) It will contain specific license and restriction terms at the top of every + file. + +============================================================================== +Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): +============================================================================== + +The GPURuntime library is dual licensed under both the University of Illinois +"BSD-Like" license and the MIT license. As a user of this code you may choose +to use it under either license. As a contributor, you agree to allow your code +to be used under both. + +Full text of the relevant licenses is included below. + +============================================================================== + +University of Illinois/NCSA +Open Source License + +Copyright (c) 2009-2019 by the contributors listed in CREDITS.TXT + +All rights reserved. 
+ +Developed by: + + Polly Team + + http://polly.llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. + +============================================================================== + +Copyright (c) 2009-2016 by the contributors listed in CREDITS.TXT + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff --git a/polly/www/documentation/gpgpucodegen.html b/polly/www/documentation/gpgpucodegen.html new file mode 100644 --- /dev/null +++ b/polly/www/documentation/gpgpucodegen.html @@ -0,0 +1,229 @@ + + + + + + Polly - GPGPU Code Generation + + + + + + + diff --git a/polly/www/index.html b/polly/www/index.html --- a/polly/www/index.html +++ b/polly/www/index.html @@ -28,7 +28,8 @@ on integer polyhedra to analyze and optimize the memory access pattern of a program. We currently perform classical loop transformations, especially tiling and loop fusion to improve data-locality. Polly can also exploit - OpenMP level parallelism, expose SIMDization opportunities.

+ OpenMP level parallelism, expose SIMDization opportunities. Work has also been + done in the area of automatic GPU code generation.

For many users, however, it's not the existing optimizations in Polly that are of most interest, but the new analyses and optimizations enabled by the Polly diff --git a/polly/www/todo.html b/polly/www/todo.html --- a/polly/www/todo.html +++ b/polly/www/todo.html @@ -342,6 +342,14 @@ Johannes GPGPU Code +Generation +in progress + +Yabin + + + Allow optimizers to change memory access functions Done
+ +
+ +

Polly - GPGPU Code Generation

+ +

WARNING: This project was part of the Google Summer of Code 2012. +It is not yet finished and is still in the design and implementation stage. +The ideas/plans described here may not yet be implemented in Polly and may +change later on.

+ +This project adds a GPGPU code generation feature to Polly. + +

Objective

+

The overall objective of this GSoC project is to create a preliminary + implementation of GPGPU code generation for Polly. With this addition, users + can parallelize some perfectly nested loops with Polly to execute on a + heterogeneous platform composed of a CPU and a GPU.

+

There are several successful projects about automatic source-to-source GPU + code transformation. C-to-CUDA[1] uses the standard Pluto algorithms for + computing an affine schedule and then applies a wavefront transformation to + obtain one sequential and n-1 parallel loops. The parallel loops are then + mapped onto the blocks and threads of the GPU. PPCG[2] introduces some advanced + algorithms which can expose much more parallelism than other methods. It + also introduces affine partition heuristics and code generation algorithms + for locality enhancement in the registers and shared memory.

+

Since automatic GPGPU code generation is quite a complex problem and what we + target is a low-level intermediate representation, LLVM IR, rather than a + high-level language source, it is important for us to set a proper objective + as a first step toward a complete solution to GPGPU code generation for LLVM + IR.

+

Firstly, we plan to target two kinds of relatively simple test cases. One is + composed of purely parallel and perfectly nested loops, like the following + code.

+
+parfor(int i=0 to M)
+  parfor(int j=0 to N)
+    LoopBody(i, j);
+
+

The other kind is a loop nest in which all loops are parallel except the inner-most + one, just like this:

+
+parfor(int i=0 to M)
+  parfor(int j=0 to N)
+    non-parfor(int k=0 to K)
+      LoopBody(i, j, k);
+
+

The LoopBody part should be limited to instructions or function calls + (intrinsics) which can be handled by LLVM's NVPTX backend.
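As a concrete illustration of these two patterns (a made-up example, not taken from Polly's test suite), consider the following C functions. In the first, every loop level is parallel and each array element is written exactly once; in the second, the two outer loops are parallel while the innermost loop carries a reduction and stays sequential. In both cases the loop body contains only arithmetic and memory accesses that the NVPTX backend can handle.

/* Pattern 1: fully parallel, perfectly nested loops. */
void init(int M, int N, float A[M][N]) {
  for (int i = 0; i < M; i++)
    for (int j = 0; j < N; j++)
      A[i][j] = (float)(i * N + j);
}

/* Pattern 2: parallel i and j loops; the innermost k loop accumulates
 * into C[i][j] and therefore remains sequential. */
void matmul(int M, int N, int K, float A[M][K], float B[K][N], float C[M][N]) {
  for (int i = 0; i < M; i++)
    for (int j = 0; j < N; j++)
      for (int k = 0; k < K; k++)
        C[i][j] += A[i][k] * B[k][j];
}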

+

On the other hand, we focus on building a preliminary and scalable framework + of GPGPU code generation for Polly. Thus we plan to employ relatively simple + tiling and mapping algorithms and optimize them later.

+

Work Flow

+

GPGPU Code Generation In General

+

C-to-CUDA[1] and PPCG[2] propose similar steps to solve the automatic GPGPU + code generation problem.

+
  • Look for parallel loops.
  • +
  • Create a polyhedral model from the loops.
  • +
  • Tile and map the loops to GPU blocks and threads.
  • +
  • Determine where to place the data.
  • +

    What has been done in Polly

    +

    Polly has implemented the 1st, 2nd and part of the 3rd of the above steps and + many other analysis and transformation passes.

    +

    What to do in Polly

    +

Unlike many source-to-source optimizers such as C-to-CUDA and PPCG, Polly is + a low-level optimizer, which means we can't use a source-level compiler + (e.g. NVCC) to generate the final assembly for the device. We need to manually + insert device driver API calls to execute the generated kernel assembly + text.

    +

In this project, we assume that the device driver library provides an + interface to launch kernels in the form of assembly text. Fortunately, most + of the mainstream GPU vendors provide such a feature in their products (see + ptxjit of NVIDIA GPUs and CAL of AMD GPUs). Generally speaking, what we + are going to do in Polly is:

    +
  • Find a way to tile the parallel loops.
  • +
  • Find a way to extract the loop body and transform it into thread-centric + parallel code.
  • +
  • Find a way to store/load the thread-centric code into/from a device module. +
  • Find a way to pass the target machine information and generate code of the + device module for the target. +
  • Find a way to map the tiled loop to GPU blocks and threads.
  • +
  • Find a way to insert CUDA synchronization operations on-demand. +
  • Find a way to generate the memory copy operations between a host and a + device.
  • +
  • Implement/Wrap a runtime library to serve as the execution engine for the + generated device code.
  • + +

    The Work Flow

    +

In this section, we assume that the host CPU is x86 and the device is NVIDIA + CUDA-compatible. We will use the following test case to describe our work + flow.

    +
    +for(i = 0; i < 128; i++)
    +      for(j = 0; j < 128; j++)
    +              A[i][j] = i*128 + j;
    +
    +

    The work flow of our code generator is as follows.

    +

1. We first use Polly's jscop file importer to obtain the desired 4-level parallel + tiled code.

+The "schedule" part of the pre-optimization jscop file is as follows:
    +"schedule" : "{ Stmt_for_body3[i0, i1] -> schedule[0, i0, 0, i1, 0] }"
    +
    +The jscop file describing the tiling transformation is: +
    +"schedule" : "{ Stmt_for_body3[i0, i1] -> schedule[0, o0, o1, o2, o3]:
    +              o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and
    +              o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and
    +              i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
    +
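To make the tiled schedule concrete: the equalities i0 = 16o0 + o1 and i1 = 16o2 + o3 split each original iterator into a tile index (o0, o2, ranging over 0..7) and an intra-tile offset (o1, o3, ranging over 0..15). For example, the original iteration (i0, i1) = (37, 90) becomes (o0, o1, o2, o3) = (2, 5, 5, 10), since 37 = 16*2 + 5 and 90 = 16*5 + 10; together the bounds cover exactly the original 0..127 range in each dimension.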
    +We can test the schedule with the following command line. +
    +opt -load /path/to/polly/build/LLVMPolly.so -basic-aa -polly-import-jscop
    +    -polly-ast -analyze -q ./test.ll
    +    -polly-import-jscop-postfix=transformed+gpu
    +
    +The output of this schedule is: +
    +for (c2=0;c2<=7;c2++) {
    +  for (c3=0;c3<=15;c3++) {
    +    for (c4=0;c4<=7;c4++) {
    +      for (c5=0;c5<=15;c5++) {
    +        Stmt_for_body3(16*c2+c3,16*c4+c5);
    +      }
    +    }
    +  }
    +}
    +
+Now we have a 4-dimensional parallel loop nest with a single SCoP statement in it.
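For orientation, the following plain-C sketch shows one plausible way this 4-level loop nest maps onto the GPU (Polly emits the kernel as LLVM IR for the NVPTX backend rather than as source code, so this is only an illustration): the outer tile loops c2 and c4 become the block indices of an 8x8 grid, and the inner loops c3 and c5 become the thread indices of a 16x16 block, so each thread executes one statement instance.

/* Conceptual kernel body for the test case above. In the generated kernel,
 * b0/b1 would be taken from the block ID and t0/t1 from the thread ID. */
static void kernel_body(int b0, int b1, int t0, int t1, float A[128][128]) {
  int i = 16 * b0 + t0; /* corresponds to 16*c2 + c3 */
  int j = 16 * b1 + t1; /* corresponds to 16*c4 + c5 */
  A[i][j] = i * 128 + j;
}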

2. We then extract the loop body (or the inner-most non-parallel loop) into an + LLVM function, tagging it with the PTX_Kernel calling convention.

    +

3. We extract the PTX_Kernel function into a temporary module, set the target + triple (e.g. nvptx64-unknown-linux) for the module, transform the temporary + module into a string, store it in the original module and erase the + PTX_Kernel function.
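The sketch below illustrates only the module-to-string part of this step, using the LLVM-C API (Polly itself works on the C++ IR classes, and moving the PTX_Kernel function between modules and erasing it are omitted): the temporary module is given an NVPTX target triple and then serialized to a string that the host module can embed.

#include <llvm-c/Core.h>

/* Sketch only: assumes the kernel function has already been moved into
 * KernelModule. The returned string is the textual LLVM IR of the module;
 * the caller stores it as a global in the host module and eventually
 * releases it with LLVMDisposeMessage(). */
static char *serializeKernelModule(LLVMModuleRef KernelModule) {
  LLVMSetTarget(KernelModule, "nvptx64-unknown-linux");
  return LLVMPrintModuleToString(KernelModule);
}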

    +

4. We replace the loops with their GPGPU counterpart. The GPGPU part of the code + is composed of a call to the llvm.codegen intrinsic and function calls to our + GPU runtime library.
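To give a feel for what the generated host-side code amounts to, here is a hand-written sketch that drives the GPURuntime interface added by this patch. The PTX string, kernel name, and array are placeholders chosen to match the 128x128 test case and the 16x16 tiling above; in the real flow they are produced by the code generator and the llvm.codegen step. The declarations normally come from the GPURuntime header and are repeated here only to keep the sketch self-contained.

/* Opaque runtime types and entry points from GPUJIT (sketch-local declarations). */
typedef struct PollyGPUContext PollyGPUContext;
typedef struct PollyGPUFunction PollyGPUFunction;
typedef struct PollyGPUDevicePtr PollyGPUDevicePtr;

PollyGPUContext *polly_initContextCUDA(void);
PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, const char *KernelName);
PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize);
void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation);
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, long MemSize);
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, long MemSize);
void polly_launchKernel(PollyGPUFunction *Kernel, unsigned GridDimX, unsigned GridDimY,
                        unsigned BlockDimX, unsigned BlockDimY, unsigned BlockDimZ,
                        void **Parameters);
void polly_synchronizeDevice(void);
void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation);
void polly_freeKernel(PollyGPUFunction *Kernel);
void polly_freeContext(PollyGPUContext *Context);

extern const char KernelPTX[]; /* placeholder: PTX text produced by the llvm.codegen step */
static float A[128][128];

void run_scop_on_gpu(void) {
  PollyGPUContext *Ctx = polly_initContextCUDA();
  PollyGPUFunction *Kernel = polly_getKernel(KernelPTX, "kernel_0" /* placeholder name */);

  PollyGPUDevicePtr *DevA = polly_allocateMemoryForDevice(sizeof(A));
  polly_copyFromHostToDevice(A, DevA, sizeof(A));

  /* Parameters follow the cuLaunchKernel convention: an array of pointers
   * to the individual kernel arguments. */
  void *DevAPtr = polly_getDevicePtr(DevA);
  void *Params[] = {&DevAPtr};

  /* An 8x8 grid of 16x16 thread blocks, matching the tiling shown above. */
  polly_launchKernel(Kernel, 8, 8, 16, 16, 1, Params);
  polly_synchronizeDevice();

  polly_copyFromDeviceToHost(DevA, A, sizeof(A));

  polly_freeDeviceMemory(DevA);
  polly_freeKernel(Kernel);
  polly_freeContext(Ctx);
}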

    +

5. Finally, we generate the executable program with llc or run the + optimized LLVM IR with a JIT compiler like lli.

    +

    Usage

    +

1. Apply the llvm.codegen intrinsic patch to the LLVM code base.

    +
    cd /path/to/llvm/source
    +git am /path/to/polly/source/utils/0001-Add-llvm.codegen-intrinsic.patch
    +

    2. Build the test case.

    +
    /path/to/polly/source/test/create_ll.sh test.c
    +

    3. Get and edit the jscop file (take function "gpu_codegen" as an example). +

    +
    opt -load /path/to/polly/build/lib/LLVMPolly.so -basic-aa
    +    -polly-export-jscop ./test.ll
    +cp gpu_codegen___%for.cond---%for.end8.jscop
    +   gpu_codegen___%for.cond---%for.end8.jscop.transformed+gpu
    +vi gpu_codegen___%for.cond---%for.end8.jscop.transformed+gpu
    +

(Please refer to the section "The Work Flow" for how to edit the "schedule" + part of a statement.)

    +

    4. Optimize the code with GPGPU code generation.

    +
    opt -load /path/to/polly/build/lib/LLVMPolly.so -basic-aa
    +    -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu
    +    -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen ./test.ll -S
    +    -o test.gpued.ll
    +

    5. Build the final assembly and executable.

    +
    llc test.gpued.ll -o test.s
    +gcc test.s -lGPURuntime -o test
    +

    (Please make sure that LD_LIBRARY_PATH is set properly so that + /path/to/polly/build/lib/libGPURuntime.so is visible to gcc.)

    +

    TODO List

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Tasks                                                 | Status                          | Owner
Tiling the Parallel Loops with An External Jscop File | Open, In Design                 | Yabin Hu
GPU Runtime Library Implementation                    | Coding Finished, In Reviewing   |
llvm.codegen Intrinsic Implementation                 | Coding Finished, To Be Reviewed |
Code Generation For Host                              | 50% Done                        |
    + +

    References

    +
• [1] Automatic C-to-CUDA Code Generation for Affine Programs.
  Muthu Manikandan Baskaran, J. Ramanujam and P. Sadayappan.
  International Conference on Compiler Construction (CC) 2010.
• [2] PPCG Project
  http://freecode.com/projects/ppcg
• [3] Where is the Data? Why You Cannot Debate GPU vs. CPU Performance Without the Answer.
  Chris Gregg and Kim Hazelwood.
  International Symposium on Performance Analysis of Systems and Software (ISPASS) 2011.

    +
    +