diff --git a/polly/CMakeLists.txt b/polly/CMakeLists.txt --- a/polly/CMakeLists.txt +++ b/polly/CMakeLists.txt @@ -85,31 +85,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) -option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF) -set(GPU_CODEGEN FALSE) -if (POLLY_ENABLE_GPGPU_CODEGEN) - # Do not require CUDA/OpenCL, as GPU code generation test cases can be run - # without a CUDA/OpenCL library. - if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) - FIND_PACKAGE(CUDA) - FIND_PACKAGE(OpenCL) - set(GPU_CODEGEN TRUE) - else() - message(WARNING "The LLVM NVPTX target is required for GPU code generation") - endif() -endif(POLLY_ENABLE_GPGPU_CODEGEN) - - -# Support GPGPU code generation if the library is available. -if (CUDA_FOUND) - add_definitions(-DHAS_LIBCUDART) - INCLUDE_DIRECTORIES( ${CUDA_INCLUDE_DIRS} ) -endif(CUDA_FOUND) -if (OpenCL_FOUND) - add_definitions(-DHAS_LIBOPENCL) - INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} ) -endif(OpenCL_FOUND) - option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON) if (NOT POLLY_BUNDLED_ISL) find_package(ISL MODULE REQUIRED) @@ -155,7 +130,6 @@ if (POLLY_GTEST_AVAIL) add_subdirectory(unittests) endif () -add_subdirectory(tools) add_subdirectory(cmake) # TODO: docs. diff --git a/polly/cmake/CMakeLists.txt b/polly/cmake/CMakeLists.txt --- a/polly/cmake/CMakeLists.txt +++ b/polly/cmake/CMakeLists.txt @@ -27,9 +27,6 @@ # LLVMPolly is a dummy target on Win or if PIC code is disabled. 
list(APPEND POLLY_CONFIG_EXPORTED_TARGETS LLVMPolly) endif() -if (POLLY_ENABLE_GPGPU_CODEGEN) - list(APPEND POLLY_CONFIG_EXPORTED_TARGETS PollyPPCG) -endif() # Get the target type for every exported target foreach(tgt IN LISTS POLLY_CONFIG_EXPORTED_TARGETS) diff --git a/polly/cmake/PollyConfig.cmake.in b/polly/cmake/PollyConfig.cmake.in --- a/polly/cmake/PollyConfig.cmake.in +++ b/polly/cmake/PollyConfig.cmake.in @@ -8,7 +8,6 @@ set(Polly_CMAKE_DIR ${CMAKE_CURRENT_LIST_DIR}) set(Polly_BUNDLED_ISL @POLLY_BUNDLED_ISL@) -set(Polly_ENABLE_GPGPU_CODEGEN @POLLY_ENABLE_GPGPU_CODEGEN@) set(Polly_DEFINITIONS ${LLVM_DEFINITIONS}) set(Polly_INCLUDE_DIRS @POLLY_CONFIG_INCLUDE_DIRS@ ${LLVM_INCLUDE_DIRS}) @@ -19,17 +18,9 @@ # Imported Targets: @ISL_CONFIG_CODE@ -if (Polly_ENABLE_GPGPU_CODEGEN AND NOT TARGET PollyPPCG) - add_library(PollyPPCG @POLLY_CONFIG_TARGET_PollyPPCG_TYPE@ IMPORTED) - set_property(TARGET PollyPPCG PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@) -endif() - if (NOT TARGET Polly) add_library(Polly @POLLY_CONFIG_TARGET_Polly_TYPE@ IMPORTED) set_property(TARGET Polly PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@) - if (Polly_ENABLE_GPGPU_CODEGEN) - set_property(TARGET Polly APPEND PROPERTY INTERFACE_LINK_LIBRARIES PollyPPCG) - endif() endif() if (NOT TARGET LLVMPolly) diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst --- a/polly/docs/ReleaseNotes.rst +++ b/polly/docs/ReleaseNotes.rst @@ -21,3 +21,5 @@ In the future we hope that Polly can collaborate better with LoopVectorize, like Polly marking a loop is safe to vectorize with a specific simd width, instead of replicating its functionality. + +- Polly-ACC has been removed. diff --git a/polly/include/polly/CodeGen/PPCGCodeGeneration.h b/polly/include/polly/CodeGen/PPCGCodeGeneration.h deleted file mode 100644 --- a/polly/include/polly/CodeGen/PPCGCodeGeneration.h +++ /dev/null @@ -1,33 +0,0 @@ -//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. 
--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Take a scop created by ScopInfo and map it to GPU code using the ppcg -// GPU mapping strategy. -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PPCGCODEGENERATION_H -#define POLLY_PPCGCODEGENERATION_H - -/// The GPU Architecture to target. -enum GPUArch { NVPTX64, SPIR32, SPIR64 }; - -/// The GPU Runtime implementation to use. -enum GPURuntime { CUDA, OpenCL }; - -namespace polly { -extern bool PollyManagedMemory; - -/// Use for pass instantiation defaults. -/// @{ -extern GPURuntime GPURuntimeChoice; -extern GPUArch GPUArchChoice; -/// @} -} // namespace polly - -#endif // POLLY_PPCGCODEGENERATION_H diff --git a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h --- a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h +++ b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h @@ -30,24 +30,20 @@ struct RuntimeDebugBuilder { /// Generate a constant string into the builder's llvm::Module which can be - /// passed to createGPUPrinter() or createGPUPrinter(). + /// passed to createCPUPrinter(). /// /// @param Builder The builder used to emit the printer calls. /// @param Str The string to be printed. /// @return A global containing @p Str. static llvm::Value *getPrintableString(PollyIRBuilder &Builder, - llvm::StringRef Str) { - // TODO: Get rid of magic number 4. It it NVPTX's constant address space and - // works on X86 (CPU) only because its backend ignores the address space. - return Builder.CreateGlobalStringPtr(Str, "", 4); - } + llvm::StringRef Str); /// Return whether an llvm::Value of the type @p Ty is printable for /// debugging. 
/// - /// That is, whether such a value can be passed to createGPUPrinter() or - /// createGPUPrinter() to be dumped as runtime. If false is returned, those + /// That is, whether such a value can be passed to createCPUPrinter() + /// to be dumped as runtime. If false is returned, those /// functions will fail. static bool isPrintable(llvm::Type *Ty); @@ -64,62 +60,41 @@ template static void createCPUPrinter(PollyIRBuilder &Builder, Args... args) { std::vector Vector; - createPrinter(Builder, /* CPU */ false, Vector, args...); - } - - /// Print a set of LLVM-IR Values or StringRefs on an NVIDIA GPU. - /// - /// This function emits a call to vprintf that will print the given - /// arguments from within a kernel thread. It is useful for debugging - /// CUDA program kernels. All arguments given in this list will be - /// automatically concatenated and the resulting string will be printed - /// atomically. We also support ArrayRef arguments, which can be used to - /// provide for example a list of thread-id values. - /// - /// @param Builder The builder used to emit the printer calls. - /// @param Args The list of values to print. - template - static void createGPUPrinter(PollyIRBuilder &Builder, Args... args) { - std::vector Vector; - createPrinter(Builder, /* GPU */ true, Vector, args...); + createPrinter(Builder, Vector, args...); } private: /// Handle Values. template - static void createPrinter(PollyIRBuilder &Builder, bool UseGPU, + static void createPrinter(PollyIRBuilder &Builder, std::vector &Values, llvm::Value *Value, Args... args) { Values.push_back(Value); - createPrinter(Builder, UseGPU, Values, args...); + createPrinter(Builder, Values, args...); } /// Handle StringRefs. template - static void createPrinter(PollyIRBuilder &Builder, bool UseGPU, + static void createPrinter(PollyIRBuilder &Builder, std::vector &Values, llvm::StringRef String, Args... 
args) { Values.push_back(getPrintableString(Builder, String)); - createPrinter(Builder, UseGPU, Values, args...); + createPrinter(Builder, Values, args...); } /// Handle ArrayRefs. template - static void createPrinter(PollyIRBuilder &Builder, bool UseGPU, + static void createPrinter(PollyIRBuilder &Builder, std::vector &Values, llvm::ArrayRef Array, Args... args) { Values.insert(Values.end(), Array.begin(), Array.end()); - createPrinter(Builder, UseGPU, Values, args...); + createPrinter(Builder, Values, args...); } /// Print a list of Values. - static void createPrinter(PollyIRBuilder &Builder, bool UseGPU, + static void createPrinter(PollyIRBuilder &Builder, llvm::ArrayRef Values); - /// Print a list of Values on a GPU. - static void createGPUPrinterT(PollyIRBuilder &Builder, - llvm::ArrayRef Values); - /// Print a list of Values on a CPU. static void createCPUPrinterT(PollyIRBuilder &Builder, llvm::ArrayRef Values); @@ -145,22 +120,6 @@ /// /// @parma Builder The builder used to insert the code. static void createFlush(PollyIRBuilder &Builder); - - /// Get (and possibly insert) a NVIDIA address space cast call. - static llvm::Function *getAddressSpaceCast(PollyIRBuilder &Builder, - unsigned Src, unsigned Dst, - unsigned SrcBits = 8, - unsigned DstBits = 8); - - /// Get identifiers that describe the currently executed GPU thread. 
- /// - /// The result will be a vector that if passed to the GPU printer will result - /// into a string (initialized to values corresponding to the printing - /// thread): - /// - /// "> block-id: bidx bid1y bidz | thread-id: tidx tidy tidz " - static std::vector - getGPUThreadIdentifiers(PollyIRBuilder &Builder); }; } // namespace polly diff --git a/polly/include/polly/Config/config.h.cmake b/polly/include/polly/Config/config.h.cmake --- a/polly/include/polly/Config/config.h.cmake +++ b/polly/include/polly/Config/config.h.cmake @@ -12,7 +12,4 @@ #ifndef POLLY_CONFIG_H #define POLLY_CONFIG_H -#cmakedefine CUDA_FOUND -#cmakedefine GPU_CODEGEN - #endif diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h --- a/polly/include/polly/LinkAllPasses.h +++ b/polly/include/polly/LinkAllPasses.h @@ -14,7 +14,6 @@ #ifndef POLLY_LINKALLPASSES_H #define POLLY_LINKALLPASSES_H -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/Config/config.h" #include "polly/Support/DumpFunctionPass.h" #include "polly/Support/DumpModulePass.h" @@ -54,14 +53,6 @@ llvm::Pass *createIslAstInfoWrapperPassPass(); llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); llvm::Pass *createCodeGenerationPass(); -#ifdef GPU_CODEGEN -llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64, - GPURuntime Runtime = GPURuntime::CUDA); - -llvm::Pass * -createManagedMemoryRewritePassPass(GPUArch Arch = GPUArch::NVPTX64, - GPURuntime Runtime = GPURuntime::CUDA); -#endif llvm::Pass *createIslScheduleOptimizerWrapperPass(); llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); llvm::Pass *createFlattenSchedulePass(); @@ -113,10 +104,6 @@ polly::createIslAstInfoWrapperPassPass(); polly::createIslAstInfoPrinterLegacyPass(llvm::outs()); polly::createCodeGenerationPass(); -#ifdef GPU_CODEGEN - polly::createPPCGCodeGenerationPass(); - polly::createManagedMemoryRewritePassPass(); -#endif 
polly::createIslScheduleOptimizerWrapperPass(); polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs()); polly::createMaximalStaticExpansionPass(); @@ -156,10 +143,6 @@ void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); void initializeCodeGenerationPass(llvm::PassRegistry &); -#ifdef GPU_CODEGEN -void initializePPCGCodeGenerationPass(llvm::PassRegistry &); -void initializeManagedMemoryRewritePassPass(llvm::PassRegistry &); -#endif void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &); diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -1684,9 +1684,6 @@ /// Number of copy statements. unsigned CopyStmtsNum = 0; - /// Flag to indicate if the Scop is to be skipped. - bool SkipScop = false; - using StmtSet = std::list; /// The statements in this Scop. @@ -2144,12 +2141,6 @@ /// Check if the SCoP has been optimized by the scheduler. bool isOptimized() const { return IsOptimized; } - /// Mark the SCoP to be skipped by ScopPass passes. - void markAsToBeSkipped() { SkipScop = true; } - - /// Check if the SCoP is to be skipped by ScopPass passes. - bool isToBeSkipped() const { return SkipScop; } - /// Return the ID of the Scop int getID() const { return ID; } diff --git a/polly/include/polly/Support/LinkGPURuntime.h b/polly/include/polly/Support/LinkGPURuntime.h deleted file mode 100644 --- a/polly/include/polly/Support/LinkGPURuntime.h +++ /dev/null @@ -1,42 +0,0 @@ -//===- Support/LinkGPURuntime.h -- Headerfile to help force-link GPURuntime =// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This header helps pull in libGPURuntime.so -// -//===----------------------------------------------------------------------===// -#ifndef POLLY_LINK_GPURUNTIME -#define POLLY_LINK_GPURUNTIME - -extern "C" { -#include "GPURuntime/GPUJIT.h" -} - -namespace polly { -struct ForceGPURuntimeLinking { - ForceGPURuntimeLinking() { - if (std::getenv("bar") != (char *)-1) - return; - // We must reference GPURuntime in such a way that compilers will not - // delete it all as dead code, even with whole program optimization, - // yet is effectively a NO-OP. As the compiler isn't smart enough - // to know that getenv() never returns -1, this will do the job. - polly_initContextCL(); - polly_initContextCUDA(); - polly_getKernel(nullptr, nullptr); - polly_freeKernel(nullptr); - polly_copyFromHostToDevice(nullptr, nullptr, 0); - polly_copyFromDeviceToHost(nullptr, nullptr, 0); - polly_synchronizeDevice(); - polly_launchKernel(nullptr, 0, 0, 0, 0, 0, nullptr); - polly_freeDeviceMemory(nullptr); - polly_freeContext(nullptr); - polly_synchronizeDevice(); - } -} structure; -} // namespace polly -#endif diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt --- a/polly/lib/CMakeLists.txt +++ b/polly/lib/CMakeLists.txt @@ -6,13 +6,6 @@ CodeGen/IslNodeBuilder.cpp CodeGen/CodeGeneration.cpp) -if (GPU_CODEGEN) - set (GPGPU_CODEGEN_FILES - CodeGen/PPCGCodeGeneration.cpp - CodeGen/ManagedMemoryRewrite.cpp - ) -endif (GPU_CODEGEN) - # Compile ISL into a separate library. add_subdirectory(External) @@ -44,12 +37,6 @@ Vectorize ) -# Polly-ACC requires the NVPTX backend to work. Ask LLVM about its libraries. -if (GPU_CODEGEN) - # This call emits an error if they NVPTX backend is not enable. 
- list(APPEND POLLY_COMPONENTS NVPTX) -endif () - # Use an object-library to add the same files to multiple libs without requiring # the sources them to be recompiled for each of them. add_llvm_pass_plugin(Polly @@ -73,7 +60,6 @@ CodeGen/Utils.cpp CodeGen/RuntimeDebugBuilder.cpp CodeGen/PerfMonitor.cpp - ${GPGPU_CODEGEN_FILES} Exchange/JSONExporter.cpp Support/GICHelper.cpp Support/SCEVAffinator.cpp @@ -127,16 +113,6 @@ ${ISL_TARGET} ) -# Additional dependencies for Polly-ACC. -if (GPU_CODEGEN) - target_link_libraries(Polly PUBLIC PollyPPCG) -endif () - -if (NOT LLVM_LINK_LLVM_DYLIB AND NOT LLVM_POLLY_LINK_INTO_TOOLS) - # Polly-ACC requires the NVPTX target to be present in the executable it is linked to - set_property(TARGET bugpoint APPEND PROPERTY LINK_LIBRARIES LLVMTarget) -endif () - # Create a loadable module Polly.so that can be loaded using # LLVM's/clang's "-load" option. if (WIN32 OR NOT LLVM_ENABLE_PIC) @@ -150,19 +126,6 @@ $ ) - # Only add the dependencies that are not part of LLVM. The latter are assumed - # to be already available in the address space the module is loaded into. - # Adding them once more would have the effect that both copies try to register - # the same command line options, to which LLVM reacts with an error. - # If Polly-ACC is enabled, the NVPTX target is also expected to reside in the - # hosts. This is not the case for bugpoint. Use LLVM_POLLY_LINK_INTO_TOOLS=ON - # instead which will automatically resolve the additional dependencies by - # Polly. 
- target_link_libraries(LLVMPolly PUBLIC ${ISL_TARGET}) - if (GPU_CODEGEN) - target_link_libraries(LLVMPolly PUBLIC PollyPPCG) - endif () - set_target_properties(LLVMPolly PROPERTIES LINKER_LANGUAGE CXX diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp --- a/polly/lib/CodeGen/BlockGenerators.cpp +++ b/polly/lib/CodeGen/BlockGenerators.cpp @@ -238,14 +238,8 @@ Builder.Insert(NewInst); BBMap[Inst] = NewInst; - // When copying the instruction onto the Module meant for the GPU, - // debug metadata attached to an instruction causes all related - // metadata to be pulled into the Module. This includes the DICompileUnit, - // which will not be listed in llvm.dbg.cu of the Module since the Module - // doesn't contain one. This fails the verification of the Module and the - // subsequent generation of the ASM string. - if (NewInst->getModule() != Inst->getModule()) - NewInst->setDebugLoc(llvm::DebugLoc()); + assert(NewInst->getModule() == Inst->getModule() && + "Expecting instructions to be in the same module"); if (!NewInst->getType()->isVoidTy()) NewInst->setName("p_" + Inst->getName()); diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp --- a/polly/lib/CodeGen/CodeGeneration.cpp +++ b/polly/lib/CodeGen/CodeGeneration.cpp @@ -323,10 +323,6 @@ /// Generate LLVM-IR for the SCoP @p S. bool runOnScop(Scop &S) override { - // Skip SCoPs in case they're already code-generated by PPCGCodeGeneration. - if (S.isToBeSkipped()) - return false; - AI = &getAnalysis().getAI(); LI = &getAnalysis().getLoopInfo(); DT = &getAnalysis().getDomTree(); diff --git a/polly/lib/CodeGen/IslAst.cpp b/polly/lib/CodeGen/IslAst.cpp --- a/polly/lib/CodeGen/IslAst.cpp +++ b/polly/lib/CodeGen/IslAst.cpp @@ -638,10 +638,6 @@ static std::unique_ptr runIslAst( Scop &Scop, function_ref GetDeps) { - // Skip SCoPs in case they're already handled by PPCGCodeGeneration. 
- if (Scop.isToBeSkipped()) - return {}; - ScopsProcessed++; const Dependences &D = GetDeps(Dependences::AL_Statement); diff --git a/polly/lib/CodeGen/ManagedMemoryRewrite.cpp b/polly/lib/CodeGen/ManagedMemoryRewrite.cpp deleted file mode 100644 --- a/polly/lib/CodeGen/ManagedMemoryRewrite.cpp +++ /dev/null @@ -1,427 +0,0 @@ -//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Take a module and rewrite: -// 1. `malloc` -> `polly_mallocManaged` -// 2. `free` -> `polly_freeManaged` -// 3. global arrays with initializers -> global arrays that are initialized -// with a constructor call to -// `polly_mallocManaged`. -// -//===----------------------------------------------------------------------===// - -#include "polly/CodeGen/IRBuilder.h" -#include "polly/CodeGen/PPCGCodeGeneration.h" -#include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" -#include "polly/Options.h" -#include "polly/ScopDetection.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/Analysis/CaptureTracking.h" -#include "llvm/InitializePasses.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" - -using namespace llvm; -using namespace polly; - -static cl::opt RewriteAllocas( - "polly-acc-rewrite-allocas", - cl::desc( - "Ask the managed memory rewriter to also rewrite alloca instructions"), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt IgnoreLinkageForGlobals( - "polly-acc-rewrite-ignore-linkage-for-globals", - cl::desc( - "By default, we only rewrite globals with internal linkage. 
This flag " - "enables rewriting of globals regardless of linkage"), - cl::Hidden, cl::cat(PollyCategory)); - -#define DEBUG_TYPE "polly-acc-rewrite-managed-memory" -namespace { - -static llvm::Function *getOrCreatePollyMallocManaged(Module &M) { - const char *Name = "polly_mallocManaged"; - Function *F = M.getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - PollyIRBuilder Builder(M.getContext()); - // TODO: How do I get `size_t`? I assume from DataLayout? - FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), - {Builder.getInt64Ty()}, false); - F = Function::Create(Ty, Linkage, Name, &M); - } - - return F; -} - -static llvm::Function *getOrCreatePollyFreeManaged(Module &M) { - const char *Name = "polly_freeManaged"; - Function *F = M.getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - PollyIRBuilder Builder(M.getContext()); - // TODO: How do I get `size_t`? I assume from DataLayout? - FunctionType *Ty = - FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false); - F = Function::Create(Ty, Linkage, Name, &M); - } - - return F; -} - -// Expand a constant expression `Cur`, which is used at instruction `Parent` -// at index `index`. -// Since a constant expression can expand to multiple instructions, store all -// the expands into a set called `Expands`. -// Note that this goes inorder on the constant expression tree. -// A * ((B * D) + C) -// will be processed with first A, then B * D, then B, then D, and then C. -// Though ConstantExprs are not treated as "trees" but as DAGs, since you can -// have something like this: -// * -// / \ -// \ / -// (D) -// -// For the purposes of this expansion, we expand the two occurences of D -// separately. 
Therefore, we expand the DAG into the tree: -// * -// / \ -// D D -// TODO: We don't _have_to do this, but this is the simplest solution. -// We can write a solution that keeps track of which constants have been -// already expanded. -static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder, - Instruction *Parent, int index, - SmallPtrSet &Expands) { - assert(Cur && "invalid constant expression passed"); - Instruction *I = Cur->getAsInstruction(); - assert(I && "unable to convert ConstantExpr to Instruction"); - - LLVM_DEBUG(dbgs() << "Expanding ConstantExpression: (" << *Cur - << ") in Instruction: (" << *I << ")\n";); - - // Invalidate `Cur` so that no one after this point uses `Cur`. Rather, - // they should mutate `I`. - Cur = nullptr; - - Expands.insert(I); - Parent->setOperand(index, I); - - // The things that `Parent` uses (its operands) should be created - // before `Parent`. - Builder.SetInsertPoint(Parent); - Builder.Insert(I); - - for (unsigned i = 0; i < I->getNumOperands(); i++) { - Value *Op = I->getOperand(i); - assert(isa(Op) && "constant must have a constant operand"); - - if (ConstantExpr *CExprOp = dyn_cast(Op)) - expandConstantExpr(CExprOp, Builder, I, i, Expands); - } -} - -// Edit all uses of `OldVal` to NewVal` in `Inst`. This will rewrite -// `ConstantExpr`s that are used in the `Inst`. -// Note that `replaceAllUsesWith` is insufficient for this purpose because it -// does not rewrite values in `ConstantExpr`s. -static void rewriteOldValToNew(Instruction *Inst, Value *OldVal, Value *NewVal, - PollyIRBuilder &Builder) { - - // This contains a set of instructions in which OldVal must be replaced. - // We start with `Inst`, and we fill it up with the expanded `ConstantExpr`s - // from `Inst`s arguments. - // We need to go through this process because `replaceAllUsesWith` does not - // actually edit `ConstantExpr`s. - SmallPtrSet InstsToVisit = {Inst}; - - // Expand all `ConstantExpr`s and place it in `InstsToVisit`. 
- for (unsigned i = 0; i < Inst->getNumOperands(); i++) { - Value *Operand = Inst->getOperand(i); - if (ConstantExpr *ValueConstExpr = dyn_cast(Operand)) - expandConstantExpr(ValueConstExpr, Builder, Inst, i, InstsToVisit); - } - - // Now visit each instruction and use `replaceUsesOfWith`. We know that - // will work because `I` cannot have any `ConstantExpr` within it. - for (Instruction *I : InstsToVisit) - I->replaceUsesOfWith(OldVal, NewVal); -} - -// Given a value `Current`, return all Instructions that may contain `Current` -// in an expression. -// We need this auxiliary function, because if we have a -// `Constant` that is a user of `V`, we need to recurse into the -// `Constant`s uses to gather the root instruction. -static void getInstructionUsersOfValue(Value *V, - SmallVector &Owners) { - if (auto *I = dyn_cast(V)) { - Owners.push_back(I); - } else { - // Anything that is a `User` must be a constant or an instruction. - auto *C = cast(V); - for (Use &CUse : C->uses()) - getInstructionUsersOfValue(CUse.getUser(), Owners); - } -} - -static void -replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array, - SmallPtrSet &ReplacedGlobals) { - // We only want arrays. - ArrayType *ArrayTy = dyn_cast(Array.getValueType()); - if (!ArrayTy) - return; - Type *ElemTy = ArrayTy->getElementType(); - PointerType *ElemPtrTy = ElemTy->getPointerTo(); - - // We only wish to replace arrays that are visible in the module they - // inhabit. Otherwise, our type edit from [T] to T* would be illegal across - // modules. - const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() || - Array.hasInternalLinkage() || - IgnoreLinkageForGlobals; - if (!OnlyVisibleInsideModule) { - LLVM_DEBUG( - dbgs() << "Not rewriting (" << Array - << ") to managed memory " - "because it could be visible externally. 
To force rewrite, " - "use -polly-acc-rewrite-ignore-linkage-for-globals.\n"); - return; - } - - if (!Array.hasInitializer() || - !isa(Array.getInitializer())) { - LLVM_DEBUG(dbgs() << "Not rewriting (" << Array - << ") to managed memory " - "because it has an initializer which is " - "not a zeroinitializer.\n"); - return; - } - - // At this point, we have committed to replacing this array. - ReplacedGlobals.insert(&Array); - - std::string NewName = Array.getName().str(); - NewName += ".toptr"; - GlobalVariable *ReplacementToArr = - cast(M.getOrInsertGlobal(NewName, ElemPtrTy)); - ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy)); - - Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M); - std::string FnName = Array.getName().str(); - FnName += ".constructor"; - PollyIRBuilder Builder(M.getContext()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); - const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Function *F = Function::Create(Ty, Linkage, FnName, &M); - BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F); - Builder.SetInsertPoint(Start); - - const uint64_t ArraySizeInt = DL.getTypeAllocSize(ArrayTy); - Value *ArraySize = Builder.getInt64(ArraySizeInt); - ArraySize->setName("array.size"); - - Value *AllocatedMemRaw = - Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw"); - Value *AllocatedMemTyped = - Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed"); - Builder.CreateStore(AllocatedMemTyped, ReplacementToArr); - Builder.CreateRetVoid(); - - const int Priority = 0; - appendToGlobalCtors(M, F, Priority, ReplacementToArr); - - SmallVector ArrayUserInstructions; - // Get all instructions that use array. We need to do this weird thing - // because `Constant`s that contain this array neeed to be expanded into - // instructions so that we can replace their parameters. 
`Constant`s cannot - // be edited easily, so we choose to convert all `Constant`s to - // `Instruction`s and handle all of the uses of `Array` uniformly. - for (Use &ArrayUse : Array.uses()) - getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions); - - for (Instruction *UserOfArrayInst : ArrayUserInstructions) { - - Builder.SetInsertPoint(UserOfArrayInst); - // ** -> * - Value *ArrPtrLoaded = - Builder.CreateLoad(ElemPtrTy, ReplacementToArr, "arrptr.load"); - // * -> [ty]* - Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast( - ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast"); - rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder); - } -} - -// We return all `allocas` that may need to be converted to a call to -// cudaMallocManaged. -static void getAllocasToBeManaged(Function &F, - SmallSet &Allocas) { - for (BasicBlock &BB : F) { - for (Instruction &I : BB) { - auto *Alloca = dyn_cast(&I); - if (!Alloca) - continue; - LLVM_DEBUG(dbgs() << "Checking if (" << *Alloca << ") may be captured: "); - - if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false, - /* StoreCaptures */ true)) { - Allocas.insert(Alloca); - LLVM_DEBUG(dbgs() << "YES (captured).\n"); - } else { - LLVM_DEBUG(dbgs() << "NO (not captured).\n"); - } - } - } -} - -static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca, - const DataLayout &DL) { - LLVM_DEBUG(dbgs() << "rewriting: (" << *Alloca << ") to managed mem.\n"); - Module *M = Alloca->getModule(); - assert(M && "Alloca does not have a module"); - - PollyIRBuilder Builder(M->getContext()); - Builder.SetInsertPoint(Alloca); - - Function *MallocManagedFn = - getOrCreatePollyMallocManaged(*Alloca->getModule()); - const uint64_t Size = DL.getTypeAllocSize(Alloca->getAllocatedType()); - Value *SizeVal = Builder.getInt64(Size); - Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal}); - Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType()); - - Function *F = 
Alloca->getFunction(); - assert(F && "Alloca has invalid function"); - - Bitcasted->takeName(Alloca); - Alloca->replaceAllUsesWith(Bitcasted); - Alloca->eraseFromParent(); - - for (BasicBlock &BB : *F) { - ReturnInst *Return = dyn_cast(BB.getTerminator()); - if (!Return) - continue; - Builder.SetInsertPoint(Return); - - Function *FreeManagedFn = getOrCreatePollyFreeManaged(*M); - Builder.CreateCall(FreeManagedFn, {RawManagedMem}); - } -} - -// Replace all uses of `Old` with `New`, even inside `ConstantExpr`. -// -// `replaceAllUsesWith` does replace values in `ConstantExpr`. This function -// actually does replace it in `ConstantExpr`. The caveat is that if there is -// a use that is *outside* a function (say, at global declarations), we fail. -// So, this is meant to be used on values which we know will only be used -// within functions. -// -// This process works by looking through the uses of `Old`. If it finds a -// `ConstantExpr`, it recursively looks for the owning instruction. -// Then, it expands all the `ConstantExpr` to instructions and replaces -// `Old` with `New` in the expanded instructions. -static void replaceAllUsesAndConstantUses(Value *Old, Value *New, - PollyIRBuilder &Builder) { - SmallVector UserInstructions; - // Get all instructions that use array. We need to do this weird thing - // because `Constant`s that contain this array neeed to be expanded into - // instructions so that we can replace their parameters. `Constant`s cannot - // be edited easily, so we choose to convert all `Constant`s to - // `Instruction`s and handle all of the uses of `Array` uniformly. 
- for (Use &ArrayUse : Old->uses()) - getInstructionUsersOfValue(ArrayUse.getUser(), UserInstructions); - - for (Instruction *I : UserInstructions) - rewriteOldValToNew(I, Old, New, Builder); -} - -class ManagedMemoryRewritePass final : public ModulePass { -public: - static char ID; - GPUArch Architecture; - GPURuntime Runtime; - - ManagedMemoryRewritePass() : ModulePass(ID) {} - bool runOnModule(Module &M) override { - const DataLayout &DL = M.getDataLayout(); - - Function *Malloc = M.getFunction("malloc"); - - if (Malloc) { - PollyIRBuilder Builder(M.getContext()); - Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M); - assert(PollyMallocManaged && "unable to create polly_mallocManaged"); - - replaceAllUsesAndConstantUses(Malloc, PollyMallocManaged, Builder); - Malloc->eraseFromParent(); - } - - Function *Free = M.getFunction("free"); - - if (Free) { - PollyIRBuilder Builder(M.getContext()); - Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M); - assert(PollyFreeManaged && "unable to create polly_freeManaged"); - - replaceAllUsesAndConstantUses(Free, PollyFreeManaged, Builder); - Free->eraseFromParent(); - } - - SmallPtrSet GlobalsToErase; - for (GlobalVariable &Global : M.globals()) - replaceGlobalArray(M, DL, Global, GlobalsToErase); - for (GlobalVariable *G : GlobalsToErase) - G->eraseFromParent(); - - // Rewrite allocas to cudaMallocs if we are asked to do so. 
- if (RewriteAllocas) { - SmallSet AllocasToBeManaged; - for (Function &F : M.functions()) - getAllocasToBeManaged(F, AllocasToBeManaged); - - for (AllocaInst *Alloca : AllocasToBeManaged) - rewriteAllocaAsManagedMemory(Alloca, DL); - } - - return true; - } -}; -} // namespace -char ManagedMemoryRewritePass::ID = 42; - -Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch, - GPURuntime Runtime) { - ManagedMemoryRewritePass *pass = new ManagedMemoryRewritePass(); - pass->Runtime = Runtime; - pass->Architecture = Arch; - return pass; -} - -INITIALIZE_PASS_BEGIN( - ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory", - "Polly - Rewrite all allocations in heap & data section to managed memory", - false, false) -INITIALIZE_PASS_DEPENDENCY(PPCGCodeGeneration); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_END( - ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory", - "Polly - Rewrite all allocations in heap & data section to managed memory", - false, false) diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp deleted file mode 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ /dev/null @@ -1,3657 +0,0 @@ -//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Take a scop created by ScopInfo and map it to GPU code using the ppcg -// GPU mapping strategy. 
-// -//===----------------------------------------------------------------------===// - -#include "polly/CodeGen/PPCGCodeGeneration.h" -#include "polly/CodeGen/CodeGeneration.h" -#include "polly/CodeGen/IslAst.h" -#include "polly/CodeGen/IslNodeBuilder.h" -#include "polly/CodeGen/PerfMonitor.h" -#include "polly/CodeGen/Utils.h" -#include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" -#include "polly/Options.h" -#include "polly/ScopDetection.h" -#include "polly/ScopInfo.h" -#include "polly/Support/ISLTools.h" -#include "polly/Support/SCEVValidator.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/IntrinsicsNVPTX.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/InitializePasses.h" -#include "llvm/Linker/Linker.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "isl/union_map.h" -#include - -extern "C" { -#include "ppcg/cuda.h" -#include "ppcg/gpu.h" -#include "ppcg/ppcg.h" -} - -#include "llvm/Support/Debug.h" - -using namespace polly; -using namespace llvm; - -#define DEBUG_TYPE "polly-codegen-ppcg" - -static cl::opt DumpSchedule("polly-acc-dump-schedule", - cl::desc("Dump the computed GPU Schedule"), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt - DumpCode("polly-acc-dump-code", - cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, - cl::cat(PollyCategory)); - -static cl::opt DumpKernelIR("polly-acc-dump-kernel-ir", - cl::desc("Dump the kernel LLVM-IR"), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt DumpKernelASM("polly-acc-dump-kernel-asm", - cl::desc("Dump the kernel assembly code"), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt FastMath("polly-acc-fastmath", - cl::desc("Allow unsafe math 
optimizations"), - cl::Hidden, cl::cat(PollyCategory)); -static cl::opt SharedMemory("polly-acc-use-shared", - cl::desc("Use shared memory"), cl::Hidden, - cl::cat(PollyCategory)); -static cl::opt PrivateMemory("polly-acc-use-private", - cl::desc("Use private memory"), cl::Hidden, - cl::cat(PollyCategory)); - -bool polly::PollyManagedMemory; -static cl::opt - XManagedMemory("polly-acc-codegen-managed-memory", - cl::desc("Generate Host kernel code assuming" - " that all memory has been" - " declared as managed memory"), - cl::location(PollyManagedMemory), cl::Hidden, - cl::init(false), cl::cat(PollyCategory)); - -static cl::opt - FailOnVerifyModuleFailure("polly-acc-fail-on-verify-module-failure", - cl::desc("Fail and generate a backtrace if" - " verifyModule fails on the GPU " - " kernel module."), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt CUDALibDevice( - "polly-acc-libdevice", cl::desc("Path to CUDA libdevice"), cl::Hidden, - cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.ll"), - cl::cat(PollyCategory)); - -static cl::opt - CudaVersion("polly-acc-cuda-version", - cl::desc("The CUDA version to compile for"), cl::Hidden, - cl::init("sm_30"), cl::cat(PollyCategory)); - -static cl::opt - MinCompute("polly-acc-mincompute", - cl::desc("Minimal number of compute statements to run on GPU."), - cl::Hidden, cl::init(10 * 512 * 512)); - -GPURuntime polly::GPURuntimeChoice; -static cl::opt - XGPURuntimeChoice("polly-gpu-runtime", - cl::desc("The GPU Runtime API to target"), - cl::values(clEnumValN(GPURuntime::CUDA, "libcudart", - "use the CUDA Runtime API"), - clEnumValN(GPURuntime::OpenCL, "libopencl", - "use the OpenCL Runtime API")), - cl::location(polly::GPURuntimeChoice), - cl::init(GPURuntime::CUDA), cl::cat(PollyCategory)); - -GPUArch polly::GPUArchChoice; -static cl::opt - XGPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"), - cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64", - "target NVIDIA 64-bit 
architecture"), - clEnumValN(GPUArch::SPIR32, "spir32", - "target SPIR 32-bit architecture"), - clEnumValN(GPUArch::SPIR64, "spir64", - "target SPIR 64-bit architecture")), - cl::location(polly::GPUArchChoice), - cl::init(GPUArch::NVPTX64), cl::cat(PollyCategory)); - -extern bool polly::PerfMonitoring; - -/// Return a unique name for a Scop, which is the scop region with the -/// function name. -std::string getUniqueScopName(const Scop *S) { - return "Scop Region: " + S->getNameStr() + - " | Function: " + std::string(S->getFunction().getName()); -} - -/// Used to store information PPCG wants for kills. This information is -/// used by live range reordering. -/// -/// @see computeLiveRangeReordering -/// @see GPUNodeBuilder::createPPCGScop -/// @see GPUNodeBuilder::createPPCGProg -struct MustKillsInfo { - /// Collection of all kill statements that will be sequenced at the end of - /// PPCGScop->schedule. - /// - /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set` - /// which merges schedules in *arbitrary* order. - /// (we don't care about the order of the kills anyway). - isl::schedule KillsSchedule; - /// Map from kill statement instances to scalars that need to be - /// killed. - /// - /// We currently derive kill information for: - /// 1. phi nodes. PHI nodes are not alive outside the scop and can - /// consequently all be killed. - /// 2. Scalar arrays that are not used outside the Scop. This is - /// checked by `isScalarUsesContainedInScop`. - /// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } - isl::union_map TaggedMustKills; - - /// Tagged must kills stripped of the tags. - /// [params] -> { Stmt_phantom[] -> scalar_to_kill[] } - isl::union_map MustKills; - - MustKillsInfo() : KillsSchedule() {} -}; - -/// Check if SAI's uses are entirely contained within Scop S. -/// If a scalar is used only with a Scop, we are free to kill it, as no data -/// can flow in/out of the value any more. 
-/// @see computeMustKillsInfo -static bool isScalarUsesContainedInScop(const Scop &S, - const ScopArrayInfo *SAI) { - assert(SAI->isValueKind() && "this function only deals with scalars." - " Dealing with arrays required alias analysis"); - - const Region &R = S.getRegion(); - for (User *U : SAI->getBasePtr()->users()) { - Instruction *I = dyn_cast(U); - assert(I && "invalid user of scop array info"); - if (!R.contains(I)) - return false; - } - return true; -} - -/// Compute must-kills needed to enable live range reordering with PPCG. -/// -/// @params S The Scop to compute live range reordering information -/// @returns live range reordering information that can be used to setup -/// PPCG. -static MustKillsInfo computeMustKillsInfo(const Scop &S) { - const isl::space ParamSpace = S.getParamSpace(); - MustKillsInfo Info; - - // 1. Collect all ScopArrayInfo that satisfy *any* of the criteria: - // 1.1 phi nodes in scop. - // 1.2 scalars that are only used within the scop - SmallVector KillMemIds; - for (ScopArrayInfo *SAI : S.arrays()) { - if (SAI->isPHIKind() || - (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI))) - KillMemIds.push_back(isl::manage(SAI->getBasePtrId().release())); - } - - Info.TaggedMustKills = isl::union_map::empty(ParamSpace.ctx()); - Info.MustKills = isl::union_map::empty(ParamSpace.ctx()); - - // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the - // schedule: - // - filter: "[control] -> { }" - // So, we choose to not create this to keep the output a little nicer, - // at the cost of some code complexity. - Info.KillsSchedule = {}; - - for (isl::id &ToKillId : KillMemIds) { - isl::id KillStmtId = isl::id::alloc( - S.getIslCtx(), - std::string("SKill_phantom_").append(ToKillId.get_name()), nullptr); - - // NOTE: construction of tagged_must_kill: - // 2. 
We need to construct a map: - // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } - // To construct this, we use `isl_map_domain_product` on 2 maps`: - // 2a. StmtToScalar: - // [param] -> { Stmt_phantom[] -> scalar_to_kill[] } - // 2b. PhantomRefToScalar: - // [param] -> { ref_phantom[] -> scalar_to_kill[] } - // - // Combining these with `isl_map_domain_product` gives us - // TaggedMustKill: - // [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } - - // 2a. [param] -> { Stmt[] -> scalar_to_kill[] } - isl::map StmtToScalar = isl::map::universe(ParamSpace); - StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::in, isl::id(KillStmtId)); - StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::out, isl::id(ToKillId)); - - isl::id PhantomRefId = isl::id::alloc( - S.getIslCtx(), std::string("ref_phantom") + ToKillId.get_name(), - nullptr); - - // 2b. [param] -> { phantom_ref[] -> scalar_to_kill[] } - isl::map PhantomRefToScalar = isl::map::universe(ParamSpace); - PhantomRefToScalar = - PhantomRefToScalar.set_tuple_id(isl::dim::in, PhantomRefId); - PhantomRefToScalar = - PhantomRefToScalar.set_tuple_id(isl::dim::out, ToKillId); - - // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } - isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar); - Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill); - - // 2. [param] -> { Stmt[] -> scalar_to_kill[] } - Info.MustKills = Info.TaggedMustKills.domain_factor_domain(); - - // 3. Create the kill schedule of the form: - // "[param] -> { Stmt_phantom[] }" - // Then add this to Info.KillsSchedule. 
- isl::space KillStmtSpace = ParamSpace; - KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId); - isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace); - - isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain); - if (!Info.KillsSchedule.is_null()) - Info.KillsSchedule = isl::manage( - isl_schedule_set(Info.KillsSchedule.release(), KillSchedule.copy())); - else - Info.KillsSchedule = KillSchedule; - } - - return Info; -} - -/// Create the ast expressions for a ScopStmt. -/// -/// This function is a callback for to generate the ast expressions for each -/// of the scheduled ScopStmts. -static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( - void *StmtT, __isl_take isl_ast_build *Build_C, - isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, - isl_id *Id, void *User), - void *UserIndex, - isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), - void *UserExpr) { - - ScopStmt *Stmt = (ScopStmt *)StmtT; - - if (!Stmt || !Build_C) - return NULL; - - isl::ast_build Build = isl::manage_copy(Build_C); - isl::ctx Ctx = Build.ctx(); - isl::id_to_ast_expr RefToExpr = isl::id_to_ast_expr::alloc(Ctx, 0); - - Stmt->setAstBuild(Build); - - for (MemoryAccess *Acc : *Stmt) { - isl::map AddrFunc = Acc->getAddressFunction(); - AddrFunc = AddrFunc.intersect_domain(Stmt->getDomain()); - - isl::id RefId = Acc->getId(); - isl::pw_multi_aff PMA = isl::pw_multi_aff::from_map(AddrFunc); - - isl::multi_pw_aff MPA = isl::multi_pw_aff(PMA); - MPA = MPA.coalesce(); - MPA = isl::manage(FunctionIndex(MPA.release(), RefId.get(), UserIndex)); - - isl::ast_expr Access = Build.access_from(MPA); - Access = isl::manage(FunctionExpr(Access.release(), RefId.get(), UserExpr)); - RefToExpr = RefToExpr.set(RefId, Access); - } - - return RefToExpr.release(); -} - -/// Given a LLVM Type, compute its size in bytes, -static int computeSizeInBytes(const Type *T) { - int bytes = T->getPrimitiveSizeInBits() / 8; - if 
(bytes == 0) - bytes = T->getScalarSizeInBits() / 8; - return bytes; -} - -/// Generate code for a GPU specific isl AST. -/// -/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which -/// generates code for general-purpose AST nodes, with special functionality -/// for generating GPU specific user nodes. -/// -/// @see GPUNodeBuilder::createUser -class GPUNodeBuilder final : public IslNodeBuilder { -public: - GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, - const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, - DominatorTree &DT, Scop &S, BasicBlock *StartBlock, - gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) - : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), - Prog(Prog), Runtime(Runtime), Arch(Arch) { - getExprBuilder().setIDToSAI(&IDToSAI); - } - - /// Create after-run-time-check initialization code. - void initializeAfterRTH(); - - /// Finalize the generated scop. - void finalize() override; - - /// Track if the full build process was successful. - /// - /// This value is set to false, if throughout the build process an error - /// occurred which prevents us from generating valid GPU code. - bool BuildSuccessful = true; - - /// The maximal number of loops surrounding a sequential kernel. - unsigned DeepestSequential = 0; - - /// The maximal number of loops surrounding a parallel kernel. - unsigned DeepestParallel = 0; - - /// Return the name to set for the ptx_kernel. - std::string getKernelFuncName(int Kernel_id); - -private: - /// A vector of array base pointers for which a new ScopArrayInfo was created. - /// - /// This vector is used to delete the ScopArrayInfo when it is not needed any - /// more. - std::vector LocalArrays; - - /// A map from ScopArrays to their corresponding device allocations. - std::map DeviceAllocations; - - /// The current GPU context. - Value *GPUContext; - - /// The set of isl_ids allocated in the kernel - std::vector KernelIds; - - /// A module containing GPU code. 
- /// - /// This pointer is only set in case we are currently generating GPU code. - std::unique_ptr GPUModule; - - /// The GPU program we generate code for. - gpu_prog *Prog; - - /// The GPU Runtime implementation to use (OpenCL or CUDA). - GPURuntime Runtime; - - /// The GPU Architecture to target. - GPUArch Arch; - - /// Class to free isl_ids. - class IslIdDeleter final { - public: - void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; - }; - - /// A set containing all isl_ids allocated in a GPU kernel. - /// - /// By releasing this set all isl_ids will be freed. - std::set> KernelIDs; - - IslExprBuilder::IDToScopArrayInfoTy IDToSAI; - - /// Create code for user-defined AST nodes. - /// - /// These AST nodes can be of type: - /// - /// - ScopStmt: A computational statement (TODO) - /// - Kernel: A GPU kernel call (TODO) - /// - Data-Transfer: A GPU <-> CPU data-transfer - /// - In-kernel synchronization - /// - In-kernel memory copy statement - /// - /// @param UserStmt The ast node to generate code for. - void createUser(__isl_take isl_ast_node *UserStmt) override; - - void createFor(__isl_take isl_ast_node *Node) override; - - enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST }; - - /// Create code for a data transfer statement - /// - /// @param TransferStmt The data transfer statement. - /// @param Direction The direction in which to transfer data. - void createDataTransfer(__isl_take isl_ast_node *TransferStmt, - enum DataDirection Direction); - - /// Find llvm::Values referenced in GPU kernel. - /// - /// @param Kernel The kernel to scan for llvm::Values - /// - /// @returns A tuple, whose: - /// - First element contains the set of values referenced by the - /// kernel - /// - Second element contains the set of functions referenced by the - /// kernel. All functions in the set satisfy - /// `isValidFunctionInKernel`. 
- /// - Third element contains loops that have induction variables - /// which are used in the kernel, *and* these loops are *neither* - /// in the scop, nor do they immediately surroung the Scop. - /// See [Code generation of induction variables of loops outside - /// Scops] - std::tuple, SetVector, SetVector, - isl::space> - getReferencesInKernel(ppcg_kernel *Kernel); - - /// Compute the sizes of the execution grid for a given kernel. - /// - /// @param Kernel The kernel to compute grid sizes for. - /// - /// @returns A tuple with grid sizes for X and Y dimension - std::tuple getGridSizes(ppcg_kernel *Kernel); - - /// Get the managed array pointer for sending host pointers to the device. - /// \note - /// This is to be used only with managed memory - Value *getManagedDeviceArray(gpu_array_info *Array, ScopArrayInfo *ArrayInfo); - - /// Compute the sizes of the thread blocks for a given kernel. - /// - /// @param Kernel The kernel to compute thread block sizes for. - /// - /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. - std::tuple getBlockSizes(ppcg_kernel *Kernel); - - /// Store a specific kernel launch parameter in the array of kernel launch - /// parameters. - /// - /// @param ArrayTy Array type of \p Parameters. - /// @param Parameters The list of parameters in which to store. - /// @param Param The kernel launch parameter to store. - /// @param Index The index in the parameter list, at which to store the - /// parameter. - void insertStoreParameter(Type *ArrayTy, Instruction *Parameters, - Instruction *Param, int Index); - - /// Create kernel launch parameters. - /// - /// @param Kernel The kernel to create parameters for. - /// @param F The kernel function that has been created. - /// @param SubtreeValues The set of llvm::Values referenced by this kernel. - /// - /// @returns A stack allocated array with pointers to the parameter - /// values that are passed to the kernel. 
- Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, - SetVector SubtreeValues); - - /// Create declarations for kernel variable. - /// - /// This includes shared memory declarations. - /// - /// @param Kernel The kernel definition to create variables for. - /// @param FN The function into which to generate the variables. - void createKernelVariables(ppcg_kernel *Kernel, Function *FN); - - /// Add CUDA annotations to module. - /// - /// Add a set of CUDA annotations that declares the maximal block dimensions - /// that will be used to execute the CUDA kernel. This allows the NVIDIA - /// PTX compiler to bound the number of allocated registers to ensure the - /// resulting kernel is known to run with up to as many block dimensions - /// as specified here. - /// - /// @param M The module to add the annotations to. - /// @param BlockDimX The size of block dimension X. - /// @param BlockDimY The size of block dimension Y. - /// @param BlockDimZ The size of block dimension Z. - void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, - Value *BlockDimZ); - - /// Create GPU kernel. - /// - /// Code generate the kernel described by @p KernelStmt. - /// - /// @param KernelStmt The ast node to generate kernel code for. - void createKernel(__isl_take isl_ast_node *KernelStmt); - - /// Generate code that computes the size of an array. - /// - /// @param Array The array for which to compute a size. - Value *getArraySize(gpu_array_info *Array); - - /// Generate code to compute the minimal offset at which an array is accessed. - /// - /// The offset of an array is the minimal array location accessed in a scop. - /// - /// Example: - /// - /// for (long i = 0; i < 100; i++) - /// A[i + 42] += ... - /// - /// getArrayOffset(A) results in 42. - /// - /// @param Array The array for which to compute the offset. - /// @returns An llvm::Value that contains the offset of the array. 
- Value *getArrayOffset(gpu_array_info *Array); - - /// Prepare the kernel arguments for kernel code generation - /// - /// @param Kernel The kernel to generate code for. - /// @param FN The function created for the kernel. - void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN); - - /// Create kernel function. - /// - /// Create a kernel function located in a newly created module that can serve - /// as target for device code generation. Set the Builder to point to the - /// start block of this newly created function. - /// - /// @param Kernel The kernel to generate code for. - /// @param SubtreeValues The set of llvm::Values referenced by this kernel. - /// @param SubtreeFunctions The set of llvm::Functions referenced by this - /// kernel. - void createKernelFunction(ppcg_kernel *Kernel, - SetVector &SubtreeValues, - SetVector &SubtreeFunctions); - - /// Create the declaration of a kernel function. - /// - /// The kernel function takes as arguments: - /// - /// - One i8 pointer for each external array reference used in the kernel. - /// - Host iterators - /// - Parameters - /// - Other LLVM Value references (TODO) - /// - /// @param Kernel The kernel to generate the function declaration for. - /// @param SubtreeValues The set of llvm::Values referenced by this kernel. - /// - /// @returns The newly declared function. - Function *createKernelFunctionDecl(ppcg_kernel *Kernel, - SetVector &SubtreeValues); - - /// Insert intrinsic functions to obtain thread and block ids. - /// - /// @param The kernel to generate the intrinsic functions for. - void insertKernelIntrinsics(ppcg_kernel *Kernel); - - /// Insert function calls to retrieve the SPIR group/local ids. - /// - /// @param Kernel The kernel to generate the function calls for. - /// @param SizeTypeIs64Bit Whether size_t of the openCl device is 64bit. - void insertKernelCallsSPIR(ppcg_kernel *Kernel, bool SizeTypeIs64bit); - - /// Setup the creation of functions referenced by the GPU kernel. 
- /// - /// 1. Create new function declarations in GPUModule which are the same as - /// SubtreeFunctions. - /// - /// 2. Populate IslNodeBuilder::ValueMap with mappings from - /// old functions (that come from the original module) to new functions - /// (that are created within GPUModule). That way, we generate references - /// to the correct function (in GPUModule) in BlockGenerator. - /// - /// @see IslNodeBuilder::ValueMap - /// @see BlockGenerator::GlobalMap - /// @see BlockGenerator::getNewValue - /// @see GPUNodeBuilder::getReferencesInKernel. - /// - /// @param SubtreeFunctions The set of llvm::Functions referenced by - /// this kernel. - void setupKernelSubtreeFunctions(SetVector SubtreeFunctions); - - /// Create a global-to-shared or shared-to-global copy statement. - /// - /// @param CopyStmt The copy statement to generate code for - void createKernelCopy(ppcg_kernel_stmt *CopyStmt); - - /// Create code for a ScopStmt called in @p Expr. - /// - /// @param Expr The expression containing the call. - /// @param KernelStmt The kernel statement referenced in the call. - void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); - - /// Create an in-kernel synchronization call. - void createKernelSync(); - - /// Create a PTX assembly string for the current GPU kernel. - /// - /// @returns A string containing the corresponding PTX assembly code. - std::string createKernelASM(); - - /// Remove references from the dominator tree to the kernel function @p F. - /// - /// @param F The function to remove references to. - void clearDominators(Function *F); - - /// Remove references from scalar evolution to the kernel function @p F. - /// - /// @param F The function to remove references to. - void clearScalarEvolution(Function *F); - - /// Remove references from loop info to the kernel function @p F. - /// - /// @param F The function to remove references to. 
- void clearLoops(Function *F); - - /// Check if the scop requires to be linked with CUDA's libdevice. - bool requiresCUDALibDevice(); - - /// Link with the NVIDIA libdevice library (if needed and available). - void addCUDALibDevice(); - - /// Finalize the generation of the kernel function. - /// - /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- - /// dump its IR to stderr. - /// - /// @returns The Assembly string of the kernel. - std::string finalizeKernelFunction(); - - /// Finalize the generation of the kernel arguments. - /// - /// This function ensures that not-read-only scalars used in a kernel are - /// stored back to the global memory location they are backed with before - /// the kernel terminates. - /// - /// @params Kernel The kernel to finalize kernel arguments for. - void finalizeKernelArguments(ppcg_kernel *Kernel); - - /// Create code that allocates memory to store arrays on device. - void allocateDeviceArrays(); - - /// Create code to prepare the managed device pointers. - void prepareManagedDeviceArrays(); - - /// Free all allocated device arrays. - void freeDeviceArrays(); - - /// Create a call to initialize the GPU context. - /// - /// @returns A pointer to the newly initialized context. - Value *createCallInitContext(); - - /// Create a call to get the device pointer for a kernel allocation. - /// - /// @param Allocation The Polly GPU allocation - /// - /// @returns The device parameter corresponding to this allocation. - Value *createCallGetDevicePtr(Value *Allocation); - - /// Create a call to free the GPU context. - /// - /// @param Context A pointer to an initialized GPU context. - void createCallFreeContext(Value *Context); - - /// Create a call to allocate memory on the device. - /// - /// @param Size The size of memory to allocate - /// - /// @returns A pointer that identifies this allocation. - Value *createCallAllocateMemoryForDevice(Value *Size); - - /// Create a call to free a device array. 
- /// - /// @param Array The device array to free. - void createCallFreeDeviceMemory(Value *Array); - - /// Create a call to copy data from host to device. - /// - /// @param HostPtr A pointer to the host data that should be copied. - /// @param DevicePtr A device pointer specifying the location to copy to. - void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr, - Value *Size); - - /// Create a call to copy data from device to host. - /// - /// @param DevicePtr A pointer to the device data that should be copied. - /// @param HostPtr A host pointer specifying the location to copy to. - void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr, - Value *Size); - - /// Create a call to synchronize Host & Device. - /// \note - /// This is to be used only with managed memory. - void createCallSynchronizeDevice(); - - /// Create a call to get a kernel from an assembly string. - /// - /// @param Buffer The string describing the kernel. - /// @param Entry The name of the kernel function to call. - /// - /// @returns A pointer to a kernel object - Value *createCallGetKernel(Value *Buffer, Value *Entry); - - /// Create a call to free a GPU kernel. - /// - /// @param GPUKernel THe kernel to free. - void createCallFreeKernel(Value *GPUKernel); - - /// Create a call to launch a GPU kernel. - /// - /// @param GPUKernel The kernel to launch. - /// @param GridDimX The size of the first grid dimension. - /// @param GridDimY The size of the second grid dimension. - /// @param GridBlockX The size of the first block dimension. - /// @param GridBlockY The size of the second block dimension. - /// @param GridBlockZ The size of the third block dimension. - /// @param Parameters A pointer to an array that contains itself pointers to - /// the parameter values passed for each kernel argument. 
- void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, - Value *GridDimY, Value *BlockDimX, - Value *BlockDimY, Value *BlockDimZ, - Value *Parameters); -}; - -std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) { - return "FUNC_" + S.getFunction().getName().str() + "_SCOP_" + - std::to_string(S.getID()) + "_KERNEL_" + std::to_string(Kernel_id); -} - -void GPUNodeBuilder::initializeAfterRTH() { - BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(), - &*Builder.GetInsertPoint(), &DT, &LI); - NewBB->setName("polly.acc.initialize"); - Builder.SetInsertPoint(&NewBB->front()); - - GPUContext = createCallInitContext(); - - if (!PollyManagedMemory) - allocateDeviceArrays(); - else - prepareManagedDeviceArrays(); -} - -void GPUNodeBuilder::finalize() { - if (!PollyManagedMemory) - freeDeviceArrays(); - - createCallFreeContext(GPUContext); - IslNodeBuilder::finalize(); -} - -void GPUNodeBuilder::allocateDeviceArrays() { - assert(!PollyManagedMemory && - "Managed memory will directly send host pointers " - "to the kernel. There is no need for device arrays"); - isl_ast_build *Build = isl_ast_build_from_context(S.getContext().release()); - - for (int i = 0; i < Prog->n_array; ++i) { - gpu_array_info *Array = &Prog->array[i]; - auto *ScopArray = (ScopArrayInfo *)Array->user; - std::string DevArrayName("p_dev_array_"); - DevArrayName.append(Array->name); - - Value *ArraySize = getArraySize(Array); - Value *Offset = getArrayOffset(Array); - if (Offset) - ArraySize = Builder.CreateSub( - ArraySize, - Builder.CreateMul(Offset, - Builder.getInt64(ScopArray->getElemSizeInBytes()))); - const SCEV *SizeSCEV = SE.getSCEV(ArraySize); - // It makes no sense to have an array of size 0. The CUDA API will - // throw an error anyway if we invoke `cuMallocManaged` with size `0`. We - // choose to be defensive and catch this at the compile phase. It is - // most likely that we are doing something wrong with size computation. 
- if (SizeSCEV->isZero()) { - errs() << getUniqueScopName(&S) - << " has computed array size 0: " << *ArraySize - << " | for array: " << *(ScopArray->getBasePtr()) - << ". This is illegal, exiting.\n"; - report_fatal_error("array size was computed to be 0"); - } - - Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); - DevArray->setName(DevArrayName); - DeviceAllocations[ScopArray] = DevArray; - } - - isl_ast_build_free(Build); -} - -void GPUNodeBuilder::prepareManagedDeviceArrays() { - assert(PollyManagedMemory && - "Device array most only be prepared in managed-memory mode"); - for (int i = 0; i < Prog->n_array; ++i) { - gpu_array_info *Array = &Prog->array[i]; - ScopArrayInfo *ScopArray = (ScopArrayInfo *)Array->user; - Value *HostPtr; - - if (gpu_array_is_scalar(Array)) - HostPtr = BlockGen.getOrCreateAlloca(ScopArray); - else - HostPtr = ScopArray->getBasePtr(); - HostPtr = getLatestValue(HostPtr); - - Value *Offset = getArrayOffset(Array); - if (Offset) { - HostPtr = Builder.CreatePointerCast( - HostPtr, ScopArray->getElementType()->getPointerTo()); - HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset); - } - - HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); - DeviceAllocations[ScopArray] = HostPtr; - } -} - -void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX, - Value *BlockDimY, Value *BlockDimZ) { - auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations"); - - for (auto &F : *M) { - if (F.getCallingConv() != CallingConv::PTX_Kernel) - continue; - - Value *V[] = {BlockDimX, BlockDimY, BlockDimZ}; - - Metadata *Elements[] = { - ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"), - ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"), - ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"), - ValueAsMetadata::get(V[2]), - }; - MDNode *Node = MDNode::get(M->getContext(), Elements); - AnnotationNode->addOperand(Node); - } -} 
- -void GPUNodeBuilder::freeDeviceArrays() { - assert(!PollyManagedMemory && "Managed memory does not use device arrays"); - for (auto &Array : DeviceAllocations) - createCallFreeDeviceMemory(Array.second); -} - -Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { - const char *Name = "polly_getKernel"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - return Builder.CreateCall(F, {Buffer, Entry}); -} - -Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { - const char *Name = "polly_getDevicePtr"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - return Builder.CreateCall(F, {Allocation}); -} - -void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, - Value *GridDimY, Value *BlockDimX, - Value *BlockDimY, Value *BlockDimZ, - Value *Parameters) { - const char *Name = "polly_launchKernel"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. 
- if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, - BlockDimZ, Parameters}); -} - -void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { - const char *Name = "polly_freeKernel"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {GPUKernel}); -} - -void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { - assert(!PollyManagedMemory && - "Managed memory does not allocate or free memory " - "for device"); - const char *Name = "polly_freeDeviceMemory"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. 
- if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {Array}); -} - -Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { - assert(!PollyManagedMemory && - "Managed memory does not allocate or free memory " - "for device"); - const char *Name = "polly_allocateMemoryForDevice"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt64Ty()); - FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - return Builder.CreateCall(F, {Size}); -} - -void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, - Value *DeviceData, - Value *Size) { - assert(!PollyManagedMemory && - "Managed memory does not transfer memory between " - "device and host"); - const char *Name = "polly_copyFromHostToDevice"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. 
- if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt64Ty()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {HostData, DeviceData, Size}); -} - -void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, - Value *HostData, - Value *Size) { - assert(!PollyManagedMemory && - "Managed memory does not transfer memory between " - "device and host"); - const char *Name = "polly_copyFromDeviceToHost"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt64Ty()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {DeviceData, HostData, Size}); -} - -void GPUNodeBuilder::createCallSynchronizeDevice() { - assert(PollyManagedMemory && "explicit synchronization is only necessary for " - "managed memory"); - const char *Name = "polly_synchronizeDevice"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. 
- if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F); -} - -Value *GPUNodeBuilder::createCallInitContext() { - const char *Name; - - switch (Runtime) { - case GPURuntime::CUDA: - Name = "polly_initContextCUDA"; - break; - case GPURuntime::OpenCL: - Name = "polly_initContextCL"; - break; - } - - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - return Builder.CreateCall(F, {}); -} - -void GPUNodeBuilder::createCallFreeContext(Value *Context) { - const char *Name = "polly_freeContext"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {Context}); -} - -/// Check if one string is a prefix of another. -/// -/// @param String The string in which to look for the prefix. -/// @param Prefix The prefix to look for. 
-static bool isPrefix(std::string String, std::string Prefix) { - return String.find(Prefix) == 0; -} - -Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { - isl::ast_build Build = isl::ast_build::from_context(S.getContext()); - Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); - - if (!gpu_array_is_scalar(Array)) { - isl::multi_pw_aff ArrayBound = isl::manage_copy(Array->bound); - - isl::pw_aff OffsetDimZero = ArrayBound.at(0); - isl::ast_expr Res = Build.expr_from(OffsetDimZero); - - for (unsigned int i = 1; i < Array->n_index; i++) { - isl::pw_aff Bound_I = ArrayBound.at(i); - isl::ast_expr Expr = Build.expr_from(Bound_I); - Res = Res.mul(Expr); - } - - Value *NumElements = ExprBuilder.create(Res.release()); - if (NumElements->getType() != ArraySize->getType()) - NumElements = Builder.CreateSExt(NumElements, ArraySize->getType()); - ArraySize = Builder.CreateMul(ArraySize, NumElements); - } - return ArraySize; -} - -Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { - if (gpu_array_is_scalar(Array)) - return nullptr; - - isl::ast_build Build = isl::ast_build::from_context(S.getContext()); - - isl::set Min = isl::manage_copy(Array->extent).lexmin(); - - isl::set ZeroSet = isl::set::universe(Min.get_space()); - - for (unsigned i : rangeIslSize(0, Min.tuple_dim())) - ZeroSet = ZeroSet.fix_si(isl::dim::set, i, 0); - - if (Min.is_subset(ZeroSet)) { - return nullptr; - } - - isl::ast_expr Result = isl::ast_expr::from_val(isl::val(Min.ctx(), 0)); - - for (unsigned i : rangeIslSize(0, Min.tuple_dim())) { - if (i > 0) { - isl::pw_aff Bound_I = - isl::manage(isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1)); - isl::ast_expr BExpr = Build.expr_from(Bound_I); - Result = Result.mul(BExpr); - } - isl::pw_aff DimMin = Min.dim_min(i); - isl::ast_expr MExpr = Build.expr_from(DimMin); - Result = Result.add(MExpr); - } - - return ExprBuilder.create(Result.release()); -} - -Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info 
*Array, - ScopArrayInfo *ArrayInfo) { - assert(PollyManagedMemory && "Only used when you wish to get a host " - "pointer for sending data to the kernel, " - "with managed memory"); - std::map::iterator it; - it = DeviceAllocations.find(ArrayInfo); - assert(it != DeviceAllocations.end() && - "Device array expected to be available"); - return it->second; -} - -void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, - enum DataDirection Direction) { - assert(!PollyManagedMemory && "Managed memory needs no data transfers"); - isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); - isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); - isl_id *Id = isl_ast_expr_get_id(Arg); - auto Array = (gpu_array_info *)isl_id_get_user(Id); - auto ScopArray = (ScopArrayInfo *)(Array->user); - - Value *Size = getArraySize(Array); - Value *Offset = getArrayOffset(Array); - Value *DevPtr = DeviceAllocations[ScopArray]; - - Value *HostPtr; - - if (gpu_array_is_scalar(Array)) - HostPtr = BlockGen.getOrCreateAlloca(ScopArray); - else - HostPtr = ScopArray->getBasePtr(); - HostPtr = getLatestValue(HostPtr); - - if (Offset) { - HostPtr = Builder.CreatePointerCast( - HostPtr, ScopArray->getElementType()->getPointerTo()); - HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset); - } - - HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); - - if (Offset) { - Size = Builder.CreateSub( - Size, Builder.CreateMul( - Offset, Builder.getInt64(ScopArray->getElemSizeInBytes()))); - } - - if (Direction == HOST_TO_DEVICE) - createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); - else - createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); - - isl_id_free(Id); - isl_ast_expr_free(Arg); - isl_ast_expr_free(Expr); - isl_ast_node_free(TransferStmt); -} - -void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { - isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); - isl_ast_expr *StmtExpr = 
isl_ast_expr_get_op_arg(Expr, 0); - isl_id *Id = isl_ast_expr_get_id(StmtExpr); - isl_id_free(Id); - isl_ast_expr_free(StmtExpr); - - const char *Str = isl_id_get_name(Id); - if (!strcmp(Str, "kernel")) { - createKernel(UserStmt); - if (PollyManagedMemory) - createCallSynchronizeDevice(); - isl_ast_expr_free(Expr); - return; - } - if (!strcmp(Str, "init_device")) { - initializeAfterRTH(); - isl_ast_node_free(UserStmt); - isl_ast_expr_free(Expr); - return; - } - if (!strcmp(Str, "clear_device")) { - finalize(); - isl_ast_node_free(UserStmt); - isl_ast_expr_free(Expr); - return; - } - if (isPrefix(Str, "to_device")) { - if (!PollyManagedMemory) - createDataTransfer(UserStmt, HOST_TO_DEVICE); - else - isl_ast_node_free(UserStmt); - - isl_ast_expr_free(Expr); - return; - } - - if (isPrefix(Str, "from_device")) { - if (!PollyManagedMemory) { - createDataTransfer(UserStmt, DEVICE_TO_HOST); - } else { - isl_ast_node_free(UserStmt); - } - isl_ast_expr_free(Expr); - return; - } - - isl_id *Anno = isl_ast_node_get_annotation(UserStmt); - struct ppcg_kernel_stmt *KernelStmt = - (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); - isl_id_free(Anno); - - switch (KernelStmt->type) { - case ppcg_kernel_domain: - createScopStmt(Expr, KernelStmt); - isl_ast_node_free(UserStmt); - return; - case ppcg_kernel_copy: - createKernelCopy(KernelStmt); - isl_ast_expr_free(Expr); - isl_ast_node_free(UserStmt); - return; - case ppcg_kernel_sync: - createKernelSync(); - isl_ast_expr_free(Expr); - isl_ast_node_free(UserStmt); - return; - } - - isl_ast_expr_free(Expr); - isl_ast_node_free(UserStmt); -} - -void GPUNodeBuilder::createFor(__isl_take isl_ast_node *Node) { - createForSequential(isl::manage(Node).as(), false); -} - -void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) { - isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index); - auto LocalAddr = ExprBuilder.createAccessAddress(LocalIndex); - isl_ast_expr *Index = 
isl_ast_expr_copy(KernelStmt->u.c.index); - auto GlobalAddr = ExprBuilder.createAccessAddress(Index); - - if (KernelStmt->u.c.read) { - LoadInst *Load = - Builder.CreateLoad(GlobalAddr.second, GlobalAddr.first, "shared.read"); - Builder.CreateStore(Load, LocalAddr.first); - } else { - LoadInst *Load = - Builder.CreateLoad(LocalAddr.second, LocalAddr.first, "shared.write"); - Builder.CreateStore(Load, GlobalAddr.first); - } -} - -void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, - ppcg_kernel_stmt *KernelStmt) { - auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; - isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; - - LoopToScevMapT LTS; - LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); - - createSubstitutions(Expr, Stmt, LTS); - - if (Stmt->isBlockStmt()) - BlockGen.copyStmt(*Stmt, LTS, Indexes); - else - RegionGen.copyStmt(*Stmt, LTS, Indexes); -} - -void GPUNodeBuilder::createKernelSync() { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - const char *SpirName = "__gen_ocl_barrier_global"; - - Function *Sync; - - switch (Arch) { - case GPUArch::SPIR64: - case GPUArch::SPIR32: - Sync = M->getFunction(SpirName); - - // If Sync is not available, declare it. - if (!Sync) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - Sync = Function::Create(Ty, Linkage, SpirName, M); - Sync->setCallingConv(CallingConv::SPIR_FUNC); - } - break; - case GPUArch::NVPTX64: - Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); - break; - } - - Builder.CreateCall(Sync, {}); -} - -/// Collect llvm::Values referenced from @p Node -/// -/// This function only applies to isl_ast_nodes that are user_nodes referring -/// to a ScopStmt. All other node types are ignore. -/// -/// @param Node The node to collect references for. -/// @param User A user pointer used as storage for the data that is collected. 
-/// -/// @returns isl_bool_true if data could be collected successfully. -isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { - if (isl_ast_node_get_type(Node) != isl_ast_node_user) - return isl_bool_true; - - isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); - isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); - isl_id *Id = isl_ast_expr_get_id(StmtExpr); - const char *Str = isl_id_get_name(Id); - isl_id_free(Id); - isl_ast_expr_free(StmtExpr); - isl_ast_expr_free(Expr); - - if (!isPrefix(Str, "Stmt")) - return isl_bool_true; - - Id = isl_ast_node_get_annotation(Node); - auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); - auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; - isl_id_free(Id); - - addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */); - - return isl_bool_true; -} - -/// A list of functions that are available in NVIDIA's libdevice. -const std::set CUDALibDeviceFunctions = { - "exp", "expf", "expl", "cos", "cosf", "sqrt", "sqrtf", - "copysign", "copysignf", "copysignl", "log", "logf", "powi", "powif"}; - -// A map from intrinsics to their corresponding libdevice functions. -const std::map IntrinsicToLibdeviceFunc = { - {"llvm.exp.f64", "exp"}, - {"llvm.exp.f32", "expf"}, - {"llvm.powi.f64.i32", "powi"}, - {"llvm.powi.f32.i32", "powif"}}; - -/// Return the corresponding CUDA libdevice function name @p Name. -/// Note that this function will try to convert instrinsics in the list -/// IntrinsicToLibdeviceFunc into libdevice functions. -/// This is because some intrinsics such as `exp` -/// are not supported by the NVPTX backend. -/// If this restriction of the backend is lifted, we should refactor our code -/// so that we use intrinsics whenever possible. -/// -/// Return "" if we are not compiling for CUDA. 
-std::string getCUDALibDeviceFuntion(StringRef NameRef) { - std::string Name = NameRef.str(); - auto It = IntrinsicToLibdeviceFunc.find(Name); - if (It != IntrinsicToLibdeviceFunc.end()) - return getCUDALibDeviceFuntion(It->second); - - if (CUDALibDeviceFunctions.count(Name)) - return ("__nv_" + Name); - - return ""; -} - -/// Check if F is a function that we can code-generate in a GPU kernel. -static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) { - assert(F && "F is an invalid pointer"); - // We string compare against the name of the function to allow - // all variants of the intrinsic "llvm.sqrt.*", "llvm.fabs", and - // "llvm.copysign". - const StringRef Name = F->getName(); - - if (AllowLibDevice && getCUDALibDeviceFuntion(Name).length() > 0) - return true; - - return F->isIntrinsic() && - (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") || - Name.startswith("llvm.copysign")); -} - -/// Do not take `Function` as a subtree value. -/// -/// We try to take the reference of all subtree values and pass them along -/// to the kernel from the host. Taking an address of any function and -/// trying to pass along is nonsensical. Only allow `Value`s that are not -/// `Function`s. -static bool isValidSubtreeValue(llvm::Value *V) { return !isa(V); } - -/// Return `Function`s from `RawSubtreeValues`. 
-static SetVector -getFunctionsFromRawSubtreeValues(SetVector RawSubtreeValues, - bool AllowCUDALibDevice) { - SetVector SubtreeFunctions; - for (Value *It : RawSubtreeValues) { - Function *F = dyn_cast(It); - if (F) { - assert(isValidFunctionInKernel(F, AllowCUDALibDevice) && - "Code should have bailed out by " - "this point if an invalid function " - "were present in a kernel."); - SubtreeFunctions.insert(F); - } - } - return SubtreeFunctions; -} - -std::tuple, SetVector, SetVector, - isl::space> -GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { - SetVector SubtreeValues; - SetVector SCEVs; - SetVector Loops; - isl::space ParamSpace = isl::space(S.getIslCtx(), 0, 0).params(); - SubtreeReferences References = { - LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator(), - &ParamSpace}; - - for (const auto &I : IDToValue) - SubtreeValues.insert(I.second); - - // NOTE: this is populated in IslNodeBuilder::addParameters - // See [Code generation of induction variables of loops outside Scops]. 
- for (const auto &I : OutsideLoopIterations) - SubtreeValues.insert(cast(I.second)->getValue()); - - isl_ast_node_foreach_descendant_top_down( - Kernel->tree, collectReferencesInGPUStmt, &References); - - for (const SCEV *Expr : SCEVs) { - findValues(Expr, SE, SubtreeValues); - findLoops(Expr, Loops); - } - - Loops.remove_if([this](const Loop *L) { - return S.contains(L) || L->contains(S.getEntry()); - }); - - for (auto &SAI : S.arrays()) - SubtreeValues.remove(SAI->getBasePtr()); - - isl_space *Space = S.getParamSpace().release(); - for (long i = 0, n = isl_space_dim(Space, isl_dim_param); i < n; i++) { - isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); - assert(IDToValue.count(Id)); - Value *Val = IDToValue[Id]; - SubtreeValues.remove(Val); - isl_id_free(Id); - } - isl_space_free(Space); - - for (long i = 0, n = isl_space_dim(Kernel->space, isl_dim_set); i < n; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); - assert(IDToValue.count(Id)); - Value *Val = IDToValue[Id]; - SubtreeValues.remove(Val); - isl_id_free(Id); - } - - // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions - // SubtreeValues. This is important, because we should not lose any - // SubtreeValues in the process of constructing the - // "ValidSubtree{Values, Functions} sets. Nor should the set - // ValidSubtree{Values, Functions} have any common element. 
- auto ValidSubtreeValuesIt = - make_filter_range(SubtreeValues, isValidSubtreeValue); - SetVector ValidSubtreeValues(ValidSubtreeValuesIt.begin(), - ValidSubtreeValuesIt.end()); - - bool AllowCUDALibDevice = Arch == GPUArch::NVPTX64; - - SetVector ValidSubtreeFunctions( - getFunctionsFromRawSubtreeValues(SubtreeValues, AllowCUDALibDevice)); - - // @see IslNodeBuilder::getReferencesInSubtree - SetVector ReplacedValues; - for (Value *V : ValidSubtreeValues) { - auto It = ValueMap.find(V); - if (It == ValueMap.end()) - ReplacedValues.insert(V); - else - ReplacedValues.insert(It->second); - } - return std::make_tuple(ReplacedValues, ValidSubtreeFunctions, Loops, - ParamSpace); -} - -void GPUNodeBuilder::clearDominators(Function *F) { - DomTreeNode *N = DT.getNode(&F->getEntryBlock()); - std::vector Nodes; - for (po_iterator I = po_begin(N), E = po_end(N); I != E; ++I) - Nodes.push_back(I->getBlock()); - - for (BasicBlock *BB : Nodes) - DT.eraseNode(BB); -} - -void GPUNodeBuilder::clearScalarEvolution(Function *F) { - for (BasicBlock &BB : *F) { - Loop *L = LI.getLoopFor(&BB); - if (L) - SE.forgetLoop(L); - } -} - -void GPUNodeBuilder::clearLoops(Function *F) { - SmallSet WorkList; - for (BasicBlock &BB : *F) { - Loop *L = LI.getLoopFor(&BB); - if (L) - WorkList.insert(L); - } - for (auto *L : WorkList) - LI.erase(L); -} - -std::tuple GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { - std::vector Sizes; - isl::ast_build Context = isl::ast_build::from_context(S.getContext()); - - isl::multi_pw_aff GridSizePwAffs = isl::manage_copy(Kernel->grid_size); - for (long i = 0; i < Kernel->n_grid; i++) { - isl::pw_aff Size = GridSizePwAffs.at(i); - isl::ast_expr GridSize = Context.expr_from(Size); - Value *Res = ExprBuilder.create(GridSize.release()); - Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); - Sizes.push_back(Res); - } - - for (long i = Kernel->n_grid; i < 3; i++) - Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); - - return std::make_tuple(Sizes[0], 
Sizes[1]); -} - -std::tuple -GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { - std::vector Sizes; - - for (long i = 0; i < Kernel->n_block; i++) { - Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); - Sizes.push_back(Res); - } - - for (long i = Kernel->n_block; i < 3; i++) - Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); - - return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); -} - -void GPUNodeBuilder::insertStoreParameter(Type *ArrayTy, - Instruction *Parameters, - Instruction *Param, int Index) { - Value *Slot = Builder.CreateGEP( - ArrayTy, Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); - Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); - Builder.CreateStore(ParamTyped, Slot); -} - -Value * -GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, - SetVector SubtreeValues) { - const int NumArgs = F->arg_size(); - std::vector ArgSizes(NumArgs); - - // If we are using the OpenCL Runtime, we need to add the kernel argument - // sizes to the end of the launch-parameter list, so OpenCL can determine - // how big the respective kernel arguments are. - // Here we need to reserve adequate space for that. 
- Type *ArrayTy; - if (Runtime == GPURuntime::OpenCL) - ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); - else - ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumArgs); - - BasicBlock *EntryBlock = - &Builder.GetInsertBlock()->getParent()->getEntryBlock(); - auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace(); - std::string Launch = "polly_launch_" + std::to_string(Kernel->id); - Instruction *Parameters = new AllocaInst( - ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator()); - - int Index = 0; - for (long i = 0; i < Prog->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); - - if (Runtime == GPURuntime::OpenCL) - ArgSizes[Index] = SAI->getElemSizeInBytes(); - - Value *DevArray = nullptr; - if (PollyManagedMemory) { - DevArray = getManagedDeviceArray(&Prog->array[i], - const_cast(SAI)); - } else { - DevArray = DeviceAllocations[const_cast(SAI)]; - DevArray = createCallGetDevicePtr(DevArray); - } - assert(DevArray != nullptr && "Array to be offloaded to device not " - "initialized"); - Value *Offset = getArrayOffset(&Prog->array[i]); - - if (Offset) { - DevArray = Builder.CreatePointerCast( - DevArray, SAI->getElementType()->getPointerTo()); - DevArray = Builder.CreateGEP(SAI->getElementType(), DevArray, - Builder.CreateNeg(Offset)); - DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy()); - } - Value *Slot = Builder.CreateGEP( - ArrayTy, Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); - - if (gpu_array_is_read_only_scalar(&Prog->array[i])) { - Value *ValPtr = nullptr; - if (PollyManagedMemory) - ValPtr = DevArray; - else - ValPtr = BlockGen.getOrCreateAlloca(SAI); - - assert(ValPtr != nullptr && "ValPtr that should point to a valid object" - " to be stored into Parameters"); - Value *ValPtrCast = - 
Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy()); - Builder.CreateStore(ValPtrCast, Slot); - } else { - Instruction *Param = - new AllocaInst(Builder.getInt8PtrTy(), AddressSpace, - Launch + "_param_" + std::to_string(Index), - EntryBlock->getTerminator()); - Builder.CreateStore(DevArray, Param); - Value *ParamTyped = - Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); - Builder.CreateStore(ParamTyped, Slot); - } - Index++; - } - - int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); - - for (long i = 0; i < NumHostIters; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); - Value *Val = IDToValue[Id]; - isl_id_free(Id); - - if (Runtime == GPURuntime::OpenCL) - ArgSizes[Index] = computeSizeInBytes(Val->getType()); - - Instruction *Param = - new AllocaInst(Val->getType(), AddressSpace, - Launch + "_param_" + std::to_string(Index), - EntryBlock->getTerminator()); - Builder.CreateStore(Val, Param); - insertStoreParameter(ArrayTy, Parameters, Param, Index); - Index++; - } - - int NumVars = isl_space_dim(Kernel->space, isl_dim_param); - - for (long i = 0; i < NumVars; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); - Value *Val = IDToValue[Id]; - if (ValueMap.count(Val)) - Val = ValueMap[Val]; - isl_id_free(Id); - - if (Runtime == GPURuntime::OpenCL) - ArgSizes[Index] = computeSizeInBytes(Val->getType()); - - Instruction *Param = - new AllocaInst(Val->getType(), AddressSpace, - Launch + "_param_" + std::to_string(Index), - EntryBlock->getTerminator()); - Builder.CreateStore(Val, Param); - insertStoreParameter(ArrayTy, Parameters, Param, Index); - Index++; - } - - for (auto Val : SubtreeValues) { - if (Runtime == GPURuntime::OpenCL) - ArgSizes[Index] = computeSizeInBytes(Val->getType()); - - Instruction *Param = - new AllocaInst(Val->getType(), AddressSpace, - Launch + "_param_" + std::to_string(Index), - EntryBlock->getTerminator()); - Builder.CreateStore(Val, Param); - 
insertStoreParameter(ArrayTy, Parameters, Param, Index); - Index++; - } - - if (Runtime == GPURuntime::OpenCL) { - for (int i = 0; i < NumArgs; i++) { - Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); - Instruction *Param = - new AllocaInst(Builder.getInt32Ty(), AddressSpace, - Launch + "_param_size_" + std::to_string(i), - EntryBlock->getTerminator()); - Builder.CreateStore(Val, Param); - insertStoreParameter(ArrayTy, Parameters, Param, Index); - Index++; - } - } - - auto Location = EntryBlock->getTerminator(); - return new BitCastInst(Parameters, Builder.getInt8PtrTy(), - Launch + "_params_i8ptr", Location); -} - -void GPUNodeBuilder::setupKernelSubtreeFunctions( - SetVector SubtreeFunctions) { - for (auto Fn : SubtreeFunctions) { - const std::string ClonedFnName = Fn->getName().str(); - Function *Clone = GPUModule->getFunction(ClonedFnName); - if (!Clone) - Clone = - Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage, - ClonedFnName, GPUModule.get()); - assert(Clone && "Expected cloned function to be initialized."); - assert(ValueMap.find(Fn) == ValueMap.end() && - "Fn already present in ValueMap"); - ValueMap[Fn] = Clone; - } -} -void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { - isl_id *Id = isl_ast_node_get_annotation(KernelStmt); - ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); - isl_id_free(Id); - isl_ast_node_free(KernelStmt); - - if (Kernel->n_grid > 1) - DeepestParallel = std::max( - DeepestParallel, (unsigned)isl_space_dim(Kernel->space, isl_dim_set)); - else - DeepestSequential = std::max( - DeepestSequential, (unsigned)isl_space_dim(Kernel->space, isl_dim_set)); - - Value *BlockDimX, *BlockDimY, *BlockDimZ; - std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); - - SetVector SubtreeValues; - SetVector SubtreeFunctions; - SetVector Loops; - isl::space ParamSpace; - std::tie(SubtreeValues, SubtreeFunctions, Loops, ParamSpace) = - getReferencesInKernel(Kernel); - - // 
Add parameters that appear only in the access function to the kernel - // space. This is important to make sure that all isl_ids are passed as - // parameters to the kernel, even though we may not have all parameters - // in the context to improve compile time. - Kernel->space = isl_space_align_params(Kernel->space, ParamSpace.release()); - - assert(Kernel->tree && "Device AST of kernel node is empty"); - - Instruction &HostInsertPoint = *Builder.GetInsertPoint(); - IslExprBuilder::IDToValueTy HostIDs = IDToValue; - ValueMapT HostValueMap = ValueMap; - BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; - ScalarMap.clear(); - BlockGenerator::EscapeUsersAllocaMapTy HostEscapeMap = EscapeMap; - EscapeMap.clear(); - - // Create for all loops we depend on values that contain the current loop - // iteration. These values are necessary to generate code for SCEVs that - // depend on such loops. As a result we need to pass them to the subfunction. - for (const Loop *L : Loops) { - const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), - SE.getUnknown(Builder.getInt64(1)), - L, SCEV::FlagAnyWrap); - Value *V = generateSCEV(OuterLIV); - OutsideLoopIterations[L] = SE.getUnknown(V); - SubtreeValues.insert(V); - } - - createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions); - setupKernelSubtreeFunctions(SubtreeFunctions); - - create(isl_ast_node_copy(Kernel->tree)); - - finalizeKernelArguments(Kernel); - Function *F = Builder.GetInsertBlock()->getParent(); - if (Arch == GPUArch::NVPTX64) - addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); - clearDominators(F); - clearScalarEvolution(F); - clearLoops(F); - - IDToValue = HostIDs; - - ValueMap = std::move(HostValueMap); - ScalarMap = std::move(HostScalarMap); - EscapeMap = std::move(HostEscapeMap); - IDToSAI.clear(); - Annotator.resetAlternativeAliasBases(); - for (auto &BasePtr : LocalArrays) - S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array); - LocalArrays.clear(); - - 
std::string ASMString = finalizeKernelFunction(); - Builder.SetInsertPoint(&HostInsertPoint); - Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); - - std::string Name = getKernelFuncName(Kernel->id); - Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); - Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); - Value *GPUKernel = createCallGetKernel(KernelString, NameString); - - Value *GridDimX, *GridDimY; - std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); - - createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, - BlockDimZ, Parameters); - createCallFreeKernel(GPUKernel); - - for (auto Id : KernelIds) - isl_id_free(Id); - - KernelIds.clear(); -} - -/// Compute the DataLayout string for the NVPTX backend. -/// -/// @param is64Bit Are we looking for a 64 bit architecture? -static std::string computeNVPTXDataLayout(bool is64Bit) { - std::string Ret = ""; - - if (!is64Bit) { - Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" - "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" - "64-v128:128:128-n16:32:64"; - } else { - Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" - "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" - "64-v128:128:128-n16:32:64"; - } - - return Ret; -} - -/// Compute the DataLayout string for a SPIR kernel. -/// -/// @param is64Bit Are we looking for a 64 bit architecture? 
-static std::string computeSPIRDataLayout(bool is64Bit) { - std::string Ret = ""; - - if (!is64Bit) { - Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" - "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" - "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" - "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; - } else { - Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" - "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" - "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" - "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; - } - - return Ret; -} - -Function * -GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, - SetVector &SubtreeValues) { - std::vector Args; - std::string Identifier = getKernelFuncName(Kernel->id); - - std::vector MemoryType; - - for (long i = 0; i < Prog->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - - if (gpu_array_is_read_only_scalar(&Prog->array[i])) { - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); - Args.push_back(SAI->getElementType()); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); - } else { - static const int UseGlobalMemory = 1; - Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory)); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1))); - } - } - - int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); - - for (long i = 0; i < NumHostIters; i++) { - Args.push_back(Builder.getInt64Ty()); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); - } - - int NumVars = isl_space_dim(Kernel->space, isl_dim_param); - - for (long i = 0; i < NumVars; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); - Value *Val = IDToValue[Id]; - isl_id_free(Id); - 
Args.push_back(Val->getType()); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); - } - - for (auto *V : SubtreeValues) { - Args.push_back(V->getType()); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); - } - - auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); - auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, - GPUModule.get()); - - std::vector EmptyStrings; - - for (unsigned int i = 0; i < MemoryType.size(); i++) { - EmptyStrings.push_back(MDString::get(FN->getContext(), "")); - } - - if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) { - FN->setMetadata("kernel_arg_addr_space", - MDNode::get(FN->getContext(), MemoryType)); - FN->setMetadata("kernel_arg_name", - MDNode::get(FN->getContext(), EmptyStrings)); - FN->setMetadata("kernel_arg_access_qual", - MDNode::get(FN->getContext(), EmptyStrings)); - FN->setMetadata("kernel_arg_type", - MDNode::get(FN->getContext(), EmptyStrings)); - FN->setMetadata("kernel_arg_type_qual", - MDNode::get(FN->getContext(), EmptyStrings)); - FN->setMetadata("kernel_arg_base_type", - MDNode::get(FN->getContext(), EmptyStrings)); - } - - switch (Arch) { - case GPUArch::NVPTX64: - FN->setCallingConv(CallingConv::PTX_Kernel); - break; - case GPUArch::SPIR32: - case GPUArch::SPIR64: - FN->setCallingConv(CallingConv::SPIR_KERNEL); - break; - } - - auto Arg = FN->arg_begin(); - for (long i = 0; i < Kernel->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - - Arg->setName(Kernel->array[i].array->name); - - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id)); - Type *EleTy = SAI->getElementType(); - Value *Val = &*Arg; - SmallVector Sizes; - isl_ast_build *Build = - isl_ast_build_from_context(isl_set_copy(Prog->context)); - Sizes.push_back(nullptr); - for (long j = 1, n = 
Kernel->array[i].array->n_index; j < n; j++) { - isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( - Build, isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j)); - auto V = ExprBuilder.create(DimSize); - Sizes.push_back(SE.getSCEV(V)); - } - const ScopArrayInfo *SAIRep = - S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array); - LocalArrays.push_back(Val); - - isl_ast_build_free(Build); - KernelIds.push_back(Id); - IDToSAI[Id] = SAIRep; - Arg++; - } - - for (long i = 0; i < NumHostIters; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); - Arg->setName(isl_id_get_name(Id)); - IDToValue[Id] = &*Arg; - KernelIDs.insert(std::unique_ptr(Id)); - Arg++; - } - - for (long i = 0; i < NumVars; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); - Arg->setName(isl_id_get_name(Id)); - Value *Val = IDToValue[Id]; - ValueMap[Val] = &*Arg; - IDToValue[Id] = &*Arg; - KernelIDs.insert(std::unique_ptr(Id)); - Arg++; - } - - for (auto *V : SubtreeValues) { - Arg->setName(V->getName()); - ValueMap[V] = &*Arg; - Arg++; - } - - return FN; -} - -void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { - Intrinsic::ID IntrinsicsBID[2]; - Intrinsic::ID IntrinsicsTID[3]; - - switch (Arch) { - case GPUArch::SPIR64: - case GPUArch::SPIR32: - llvm_unreachable("Cannot generate NVVM intrinsics for SPIR"); - case GPUArch::NVPTX64: - IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; - IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; - - IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; - IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; - IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; - break; - } - - auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { - std::string Name = isl_id_get_name(Id); - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); - Value *Val = 
Builder.CreateCall(IntrinsicFn, {}); - Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); - IDToValue[Id] = Val; - KernelIDs.insert(std::unique_ptr(Id)); - }; - - for (int i = 0; i < Kernel->n_grid; ++i) { - isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); - addId(Id, IntrinsicsBID[i]); - } - - for (int i = 0; i < Kernel->n_block; ++i) { - isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); - addId(Id, IntrinsicsTID[i]); - } -} - -void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel, - bool SizeTypeIs64bit) { - const char *GroupName[3] = {"__gen_ocl_get_group_id0", - "__gen_ocl_get_group_id1", - "__gen_ocl_get_group_id2"}; - - const char *LocalName[3] = {"__gen_ocl_get_local_id0", - "__gen_ocl_get_local_id1", - "__gen_ocl_get_local_id2"}; - IntegerType *SizeT = - SizeTypeIs64bit ? Builder.getInt64Ty() : Builder.getInt32Ty(); - - auto createFunc = [this](const char *Name, __isl_take isl_id *Id, - IntegerType *SizeT) mutable { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *FN = M->getFunction(Name); - - // If FN is not available, declare it. 
- if (!FN) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - FunctionType *Ty = FunctionType::get(SizeT, Args, false); - FN = Function::Create(Ty, Linkage, Name, M); - FN->setCallingConv(CallingConv::SPIR_FUNC); - } - - Value *Val = Builder.CreateCall(FN, {}); - if (SizeT == Builder.getInt32Ty()) - Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); - IDToValue[Id] = Val; - KernelIDs.insert(std::unique_ptr(Id)); - }; - - for (int i = 0; i < Kernel->n_grid; ++i) - createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i), SizeT); - - for (int i = 0; i < Kernel->n_block; ++i) - createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i), SizeT); -} - -void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { - auto Arg = FN->arg_begin(); - for (long i = 0; i < Kernel->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id)); - isl_id_free(Id); - - if (SAI->getNumberOfDimensions() > 0) { - Arg++; - continue; - } - - Value *Val = &*Arg; - - if (!gpu_array_is_read_only_scalar(&Prog->array[i])) { - Type *TypePtr = SAI->getElementType()->getPointerTo(); - Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr); - Val = Builder.CreateLoad(SAI->getElementType(), TypedArgPtr); - } - - Value *Alloca = BlockGen.getOrCreateAlloca(SAI); - Builder.CreateStore(Val, Alloca); - - Arg++; - } -} - -void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) { - auto *FN = Builder.GetInsertBlock()->getParent(); - auto Arg = FN->arg_begin(); - - bool StoredScalar = false; - for (long i = 0; i < Kernel->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = 
ScopArrayInfo::getFromId(isl::manage_copy(Id)); - isl_id_free(Id); - - if (SAI->getNumberOfDimensions() > 0) { - Arg++; - continue; - } - - if (gpu_array_is_read_only_scalar(&Prog->array[i])) { - Arg++; - continue; - } - - Value *Alloca = BlockGen.getOrCreateAlloca(SAI); - Value *ArgPtr = &*Arg; - Type *TypePtr = SAI->getElementType()->getPointerTo(); - Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); - Value *Val = Builder.CreateLoad(SAI->getElementType(), Alloca); - Builder.CreateStore(Val, TypedArgPtr); - StoredScalar = true; - - Arg++; - } - - if (StoredScalar) { - /// In case more than one thread contains scalar stores, the generated - /// code might be incorrect, if we only store at the end of the kernel. - /// To support this case we need to store these scalars back at each - /// memory store or at least before each kernel barrier. - if (Kernel->n_block != 0 || Kernel->n_grid != 0) { - BuildSuccessful = 0; - LLVM_DEBUG( - dbgs() << getUniqueScopName(&S) - << " has a store to a scalar value that" - " would be undefined to run in parallel. 
Bailing out.\n";); - } - } -} - -void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - - for (int i = 0; i < Kernel->n_var; ++i) { - struct ppcg_kernel_var &Var = Kernel->var[i]; - isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set); - Type *EleTy = ScopArrayInfo::getFromId(isl::manage(Id))->getElementType(); - - Type *ArrayTy = EleTy; - SmallVector Sizes; - - Sizes.push_back(nullptr); - for (unsigned int j = 1; j < Var.array->n_index; ++j) { - isl_val *Val = isl_vec_get_element_val(Var.size, j); - long Bound = isl_val_get_num_si(Val); - isl_val_free(Val); - Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound)); - } - - for (int j = Var.array->n_index - 1; j >= 0; --j) { - isl_val *Val = isl_vec_get_element_val(Var.size, j); - long Bound = isl_val_get_num_si(Val); - isl_val_free(Val); - ArrayTy = ArrayType::get(ArrayTy, Bound); - } - - const ScopArrayInfo *SAI; - Value *Allocation; - if (Var.type == ppcg_access_shared) { - auto GlobalVar = new GlobalVariable( - *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name, - nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3); - GlobalVar->setAlignment(llvm::Align(EleTy->getPrimitiveSizeInBits() / 8)); - GlobalVar->setInitializer(Constant::getNullValue(ArrayTy)); - - Allocation = GlobalVar; - } else if (Var.type == ppcg_access_private) { - Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array"); - } else { - llvm_unreachable("unknown variable type"); - } - SAI = - S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array); - Id = isl_id_alloc(S.getIslCtx().get(), Var.name, nullptr); - IDToValue[Id] = Allocation; - LocalArrays.push_back(Allocation); - KernelIds.push_back(Id); - IDToSAI[Id] = SAI; - } -} - -void GPUNodeBuilder::createKernelFunction( - ppcg_kernel *Kernel, SetVector &SubtreeValues, - SetVector &SubtreeFunctions) { - std::string Identifier = 
getKernelFuncName(Kernel->id); - GPUModule.reset(new Module(Identifier, Builder.getContext())); - - switch (Arch) { - case GPUArch::NVPTX64: - if (Runtime == GPURuntime::CUDA) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); - else if (Runtime == GPURuntime::OpenCL) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); - GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); - break; - case GPUArch::SPIR32: - GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown")); - GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */)); - break; - case GPUArch::SPIR64: - GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown")); - GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */)); - break; - } - - Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); - - BasicBlock *PrevBlock = Builder.GetInsertBlock(); - auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); - - DT.addNewBlock(EntryBlock, PrevBlock); - - Builder.SetInsertPoint(EntryBlock); - Builder.CreateRetVoid(); - Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); - - ScopDetection::markFunctionAsInvalid(FN); - - prepareKernelArguments(Kernel, FN); - createKernelVariables(Kernel, FN); - - switch (Arch) { - case GPUArch::NVPTX64: - insertKernelIntrinsics(Kernel); - break; - case GPUArch::SPIR32: - insertKernelCallsSPIR(Kernel, false); - break; - case GPUArch::SPIR64: - insertKernelCallsSPIR(Kernel, true); - break; - } -} - -std::string GPUNodeBuilder::createKernelASM() { - llvm::Triple GPUTriple; - - switch (Arch) { - case GPUArch::NVPTX64: - switch (Runtime) { - case GPURuntime::CUDA: - GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda")); - break; - case GPURuntime::OpenCL: - GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl")); - break; - } - break; - case GPUArch::SPIR64: - case GPUArch::SPIR32: - std::string SPIRAssembly; - raw_string_ostream 
IROstream(SPIRAssembly); - IROstream << *GPUModule; - IROstream.flush(); - return SPIRAssembly; - } - - std::string ErrMsg; - auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); - - if (!GPUTarget) { - errs() << ErrMsg << "\n"; - return ""; - } - - TargetOptions Options; - Options.UnsafeFPMath = FastMath; - - std::string subtarget; - - switch (Arch) { - case GPUArch::NVPTX64: - subtarget = CudaVersion; - break; - case GPUArch::SPIR32: - case GPUArch::SPIR64: - llvm_unreachable("No subtarget for SPIR architecture"); - } - - std::unique_ptr TargetM(GPUTarget->createTargetMachine( - GPUTriple.getTriple(), subtarget, "", Options, std::nullopt)); - - SmallString<0> ASMString; - raw_svector_ostream ASMStream(ASMString); - llvm::legacy::PassManager PM; - - PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); - - if (TargetM->addPassesToEmitFile(PM, ASMStream, nullptr, CGFT_AssemblyFile, - true /* verify */)) { - errs() << "The target does not support generation of this file type!\n"; - return ""; - } - - PM.run(*GPUModule); - - return ASMStream.str().str(); -} - -bool GPUNodeBuilder::requiresCUDALibDevice() { - bool RequiresLibDevice = false; - for (Function &F : GPUModule->functions()) { - if (!F.isDeclaration()) - continue; - - const std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(F.getName()); - if (CUDALibDeviceFunc.length() != 0) { - // We need to handle the case where a module looks like this: - // @expf(..) - // @llvm.exp.f64(..) - // Both of these functions would be renamed to `__nv_expf`. - // - // So, we must first check for the existence of the libdevice function. - // If this exists, we replace our current function with it. - // - // If it does not exist, we rename the current function to the - // libdevice functiono name. 
- if (Function *Replacement = F.getParent()->getFunction(CUDALibDeviceFunc)) - F.replaceAllUsesWith(Replacement); - else - F.setName(CUDALibDeviceFunc); - RequiresLibDevice = true; - } - } - - return RequiresLibDevice; -} - -void GPUNodeBuilder::addCUDALibDevice() { - if (Arch != GPUArch::NVPTX64) - return; - - if (requiresCUDALibDevice()) { - SMDiagnostic Error; - - errs() << CUDALibDevice << "\n"; - auto LibDeviceModule = - parseIRFile(CUDALibDevice, Error, GPUModule->getContext()); - - if (!LibDeviceModule) { - BuildSuccessful = false; - report_fatal_error("Could not find or load libdevice. Skipping GPU " - "kernel generation. Please set -polly-acc-libdevice " - "accordingly.\n"); - return; - } - - Linker L(*GPUModule); - - // Set an nvptx64 target triple to avoid linker warnings. The original - // triple of the libdevice files are nvptx-unknown-unknown. - LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); - L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded); - } -} - -std::string GPUNodeBuilder::finalizeKernelFunction() { - - if (verifyModule(*GPUModule)) { - LLVM_DEBUG(dbgs() << "verifyModule failed on module:\n"; - GPUModule->print(dbgs(), nullptr); dbgs() << "\n";); - LLVM_DEBUG(dbgs() << "verifyModule Error:\n"; - verifyModule(*GPUModule, &dbgs());); - - if (FailOnVerifyModuleFailure) - llvm_unreachable("VerifyModule failed."); - - BuildSuccessful = false; - return ""; - } - - addCUDALibDevice(); - - if (DumpKernelIR) - outs() << *GPUModule << "\n"; - - if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) { - // Optimize module. 
- llvm::legacy::PassManager OptPasses; - PassManagerBuilder PassBuilder; - PassBuilder.OptLevel = 3; - PassBuilder.SizeLevel = 0; - PassBuilder.populateModulePassManager(OptPasses); - OptPasses.run(*GPUModule); - } - - std::string Assembly = createKernelASM(); - - if (DumpKernelASM) - outs() << Assembly << "\n"; - - GPUModule.release(); - KernelIDs.clear(); - - return Assembly; -} -/// Construct an `isl_pw_aff_list` from a vector of `isl_pw_aff` -/// @param PwAffs The list of piecewise affine functions to create an -/// `isl_pw_aff_list` from. We expect an rvalue ref because -/// all the isl_pw_aff are used up by this function. -/// -/// @returns The `isl_pw_aff_list`. -__isl_give isl_pw_aff_list * -createPwAffList(isl_ctx *Context, - const std::vector<__isl_take isl_pw_aff *> &&PwAffs) { - isl_pw_aff_list *List = isl_pw_aff_list_alloc(Context, PwAffs.size()); - - for (unsigned i = 0; i < PwAffs.size(); i++) { - List = isl_pw_aff_list_insert(List, i, PwAffs[i]); - } - return List; -} - -/// Align all the `PwAffs` such that they have the same parameter dimensions. -/// -/// We loop over all `pw_aff` and align all of their spaces together to -/// create a common space for all the `pw_aff`. This common space is the -/// `AlignSpace`. We then align all the `pw_aff` to this space. We start -/// with the given `SeedSpace`. -/// @param PwAffs The list of piecewise affine functions we want to align. -/// This is an rvalue reference because the entire vector is -/// used up by the end of the operation. -/// @param SeedSpace The space to start the alignment process with. -/// @returns A std::pair, whose first element is the aligned space, -/// whose second element is the vector of aligned piecewise -/// affines. 
-static std::pair<__isl_give isl_space *, std::vector<__isl_give isl_pw_aff *>> -alignPwAffs(const std::vector<__isl_take isl_pw_aff *> &&PwAffs, - __isl_take isl_space *SeedSpace) { - assert(SeedSpace && "Invalid seed space given."); - - isl_space *AlignSpace = SeedSpace; - for (isl_pw_aff *PwAff : PwAffs) { - isl_space *PwAffSpace = isl_pw_aff_get_domain_space(PwAff); - AlignSpace = isl_space_align_params(AlignSpace, PwAffSpace); - } - std::vector AdjustedPwAffs; - - for (unsigned i = 0; i < PwAffs.size(); i++) { - isl_pw_aff *Adjusted = PwAffs[i]; - assert(Adjusted && "Invalid pw_aff given."); - Adjusted = isl_pw_aff_align_params(Adjusted, isl_space_copy(AlignSpace)); - AdjustedPwAffs.push_back(Adjusted); - } - return std::make_pair(AlignSpace, AdjustedPwAffs); -} - -namespace { -class PPCGCodeGeneration final : public ScopPass { -public: - static char ID; - - GPURuntime Runtime = GPURuntime::CUDA; - - GPUArch Architecture = GPUArch::NVPTX64; - - /// The scop that is currently processed. - Scop *S; - - LoopInfo *LI; - DominatorTree *DT; - ScalarEvolution *SE; - const DataLayout *DL; - RegionInfo *RI; - - PPCGCodeGeneration() : ScopPass(ID) { - // Apply defaults. - Runtime = GPURuntimeChoice; - Architecture = GPUArchChoice; - } - - /// Construct compilation options for PPCG. - /// - /// @returns The compilation options. 
- ppcg_options *createPPCGOptions() { - auto DebugOptions = - (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); - auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); - - DebugOptions->dump_schedule_constraints = false; - DebugOptions->dump_schedule = false; - DebugOptions->dump_final_schedule = false; - DebugOptions->dump_sizes = false; - DebugOptions->verbose = false; - - Options->debug = DebugOptions; - - Options->group_chains = false; - Options->reschedule = true; - Options->scale_tile_loops = false; - Options->wrap = false; - - Options->non_negative_parameters = false; - Options->ctx = nullptr; - Options->sizes = nullptr; - - Options->tile = true; - Options->tile_size = 32; - - Options->isolate_full_tiles = false; - - Options->use_private_memory = PrivateMemory; - Options->use_shared_memory = SharedMemory; - Options->max_shared_memory = 48 * 1024; - - Options->target = PPCG_TARGET_CUDA; - Options->openmp = false; - Options->linearize_device_arrays = true; - Options->allow_gnu_extensions = false; - - Options->unroll_copy_shared = false; - Options->unroll_gpu_tile = false; - Options->live_range_reordering = true; - - Options->live_range_reordering = true; - Options->hybrid = false; - Options->opencl_compiler_options = nullptr; - Options->opencl_use_gpu = false; - Options->opencl_n_include_file = 0; - Options->opencl_include_files = nullptr; - Options->opencl_print_kernel_types = false; - Options->opencl_embed_kernel_code = false; - - Options->save_schedule_file = nullptr; - Options->load_schedule_file = nullptr; - - return Options; - } - - /// Get a tagged access relation containing all accesses of type @p AccessTy. - /// - /// Instead of a normal access of the form: - /// - /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] - /// - /// a tagged access has the form - /// - /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] - /// - /// where 'id' is an additional space that references the memory access that - /// triggered the access. 
- /// - /// @param AccessTy The type of the memory accesses to collect. - /// - /// @return The relation describing all tagged memory accesses. - isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { - isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace().release()); - - for (auto &Stmt : *S) - for (auto &Acc : Stmt) - if (Acc->getType() == AccessTy) { - isl_map *Relation = Acc->getAccessRelation().release(); - Relation = - isl_map_intersect_domain(Relation, Stmt.getDomain().release()); - - isl_space *Space = isl_map_get_space(Relation); - Space = isl_space_range(Space); - Space = isl_space_from_range(Space); - Space = - isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release()); - isl_map *Universe = isl_map_universe(Space); - Relation = isl_map_domain_product(Relation, Universe); - Accesses = isl_union_map_add_map(Accesses, Relation); - } - - return Accesses; - } - - /// Get the set of all read accesses, tagged with the access id. - /// - /// @see getTaggedAccesses - isl_union_map *getTaggedReads() { - return getTaggedAccesses(MemoryAccess::READ); - } - - /// Get the set of all may (and must) accesses, tagged with the access id. - /// - /// @see getTaggedAccesses - isl_union_map *getTaggedMayWrites() { - return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), - getTaggedAccesses(MemoryAccess::MUST_WRITE)); - } - - /// Get the set of all must accesses, tagged with the access id. - /// - /// @see getTaggedAccesses - isl_union_map *getTaggedMustWrites() { - return getTaggedAccesses(MemoryAccess::MUST_WRITE); - } - - /// Collect parameter and array names as isl_ids. - /// - /// To reason about the different parameters and arrays used, ppcg requires - /// a list of all isl_ids in use. As PPCG traditionally performs - /// source-to-source compilation each of these isl_ids is mapped to the - /// expression that represents it. 
As we do not have a corresponding - /// expression in Polly, we just map each id to a 'zero' expression to match - /// the data format that ppcg expects. - /// - /// @returns Retun a map from collected ids to 'zero' ast expressions. - __isl_give isl_id_to_ast_expr *getNames() { - auto *Names = isl_id_to_ast_expr_alloc( - S->getIslCtx().get(), - S->getNumParams() + std::distance(S->array_begin(), S->array_end())); - auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx().get())); - - for (const SCEV *P : S->parameters()) { - isl_id *Id = S->getIdForParam(P).release(); - Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); - } - - for (auto &Array : S->arrays()) { - auto Id = Array->getBasePtrId().release(); - Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); - } - - isl_ast_expr_free(Zero); - - return Names; - } - - /// Create a new PPCG scop from the current scop. - /// - /// The PPCG scop is initialized with data from the current polly::Scop. From - /// this initial data, the data-dependences in the PPCG scop are initialized. - /// We do not use Polly's dependence analysis for now, to ensure we match - /// the PPCG default behaviour more closely. - /// - /// @returns A new ppcg scop. - ppcg_scop *createPPCGScop() { - MustKillsInfo KillsInfo = computeMustKillsInfo(*S); - - auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); - - PPCGScop->options = createPPCGOptions(); - // enable live range reordering - PPCGScop->options->live_range_reordering = 1; - - PPCGScop->start = 0; - PPCGScop->end = 0; - - PPCGScop->context = S->getContext().release(); - PPCGScop->domain = S->getDomains().release(); - // TODO: investigate this further. PPCG calls collect_call_domains. 
- PPCGScop->call = isl_union_set_from_set(S->getContext().release()); - PPCGScop->tagged_reads = getTaggedReads(); - PPCGScop->reads = S->getReads().release(); - PPCGScop->live_in = nullptr; - PPCGScop->tagged_may_writes = getTaggedMayWrites(); - PPCGScop->may_writes = S->getWrites().release(); - PPCGScop->tagged_must_writes = getTaggedMustWrites(); - PPCGScop->must_writes = S->getMustWrites().release(); - PPCGScop->live_out = nullptr; - PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.release(); - PPCGScop->must_kills = KillsInfo.MustKills.release(); - - PPCGScop->tagger = nullptr; - PPCGScop->independence = - isl_union_map_empty(isl_set_get_space(PPCGScop->context)); - PPCGScop->dep_flow = nullptr; - PPCGScop->tagged_dep_flow = nullptr; - PPCGScop->dep_false = nullptr; - PPCGScop->dep_forced = nullptr; - PPCGScop->dep_order = nullptr; - PPCGScop->tagged_dep_order = nullptr; - - PPCGScop->schedule = S->getScheduleTree().release(); - // If we have something non-trivial to kill, add it to the schedule - if (KillsInfo.KillsSchedule.get()) - PPCGScop->schedule = isl_schedule_sequence( - PPCGScop->schedule, KillsInfo.KillsSchedule.release()); - - PPCGScop->names = getNames(); - PPCGScop->pet = nullptr; - - compute_tagger(PPCGScop); - compute_dependences(PPCGScop); - eliminate_dead_code(PPCGScop); - - return PPCGScop; - } - - /// Collect the array accesses in a statement. - /// - /// @param Stmt The statement for which to collect the accesses. - /// - /// @returns A list of array accesses. 
- gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) { - gpu_stmt_access *Accesses = nullptr; - - for (MemoryAccess *Acc : Stmt) { - auto Access = - isl_alloc_type(S->getIslCtx().get(), struct gpu_stmt_access); - Access->read = Acc->isRead(); - Access->write = Acc->isWrite(); - Access->access = Acc->getAccessRelation().release(); - isl_space *Space = isl_map_get_space(Access->access); - Space = isl_space_range(Space); - Space = isl_space_from_range(Space); - Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release()); - isl_map *Universe = isl_map_universe(Space); - Access->tagged_access = - isl_map_domain_product(Acc->getAccessRelation().release(), Universe); - Access->exact_write = !Acc->isMayWrite(); - Access->ref_id = Acc->getId().release(); - Access->next = Accesses; - Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions(); - // TODO: Also mark one-element accesses to arrays as fixed-element. - Access->fixed_element = - Acc->isLatestScalarKind() ? isl_bool_true : isl_bool_false; - Accesses = Access; - } - - return Accesses; - } - - /// Collect the list of GPU statements. - /// - /// Each statement has an id, a pointer to the underlying data structure, - /// as well as a list with all memory accesses. - /// - /// TODO: Initialize the list of memory accesses. - /// - /// @returns A linked-list of statements. - gpu_stmt *getStatements() { - gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx().get(), struct gpu_stmt, - std::distance(S->begin(), S->end())); - - int i = 0; - for (auto &Stmt : *S) { - gpu_stmt *GPUStmt = &Stmts[i]; - - GPUStmt->id = Stmt.getDomainId().release(); - - // We use the pet stmt pointer to keep track of the Polly statements. - GPUStmt->stmt = (pet_stmt *)&Stmt; - GPUStmt->accesses = getStmtAccesses(Stmt); - i++; - } - - return Stmts; - } - - /// Derive the extent of an array. - /// - /// The extent of an array is the set of elements that are within the - /// accessed array. 
For the inner dimensions, the extent constraints are - /// 0 and the size of the corresponding array dimension. For the first - /// (outermost) dimension, the extent constraints are the minimal and maximal - /// subscript value for the first dimension. - /// - /// @param Array The array to derive the extent for. - /// - /// @returns An isl_set describing the extent of the array. - isl::set getExtent(ScopArrayInfo *Array) { - unsigned NumDims = Array->getNumberOfDimensions(); - - if (Array->getNumberOfDimensions() == 0) - return isl::set::universe(Array->getSpace()); - - isl::union_map Accesses = S->getAccesses(Array); - isl::union_set AccessUSet = Accesses.range(); - AccessUSet = AccessUSet.coalesce(); - AccessUSet = AccessUSet.detect_equalities(); - AccessUSet = AccessUSet.coalesce(); - - if (AccessUSet.is_empty()) - return isl::set::empty(Array->getSpace()); - - isl::set AccessSet = AccessUSet.extract_set(Array->getSpace()); - - isl::local_space LS = isl::local_space(Array->getSpace()); - - isl::pw_aff Val = isl::aff::var_on_domain(LS, isl::dim::set, 0); - isl::pw_aff OuterMin = AccessSet.dim_min(0); - isl::pw_aff OuterMax = AccessSet.dim_max(0); - OuterMin = OuterMin.add_dims(isl::dim::in, - unsignedFromIslSize(Val.dim(isl::dim::in))); - OuterMax = OuterMax.add_dims(isl::dim::in, - unsignedFromIslSize(Val.dim(isl::dim::in))); - OuterMin = OuterMin.set_tuple_id(isl::dim::in, Array->getBasePtrId()); - OuterMax = OuterMax.set_tuple_id(isl::dim::in, Array->getBasePtrId()); - - isl::set Extent = isl::set::universe(Array->getSpace()); - - Extent = Extent.intersect(OuterMin.le_set(Val)); - Extent = Extent.intersect(OuterMax.ge_set(Val)); - - for (unsigned i = 1; i < NumDims; ++i) - Extent = Extent.lower_bound_si(isl::dim::set, i, 0); - - for (unsigned i = 0; i < NumDims; ++i) { - isl::pw_aff PwAff = Array->getDimensionSizePw(i); - - // isl_pw_aff can be NULL for zero dimension. Only in the case of a - // Fortran array will we have a legitimate dimension. 
- if (PwAff.is_null()) { - assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension"); - continue; - } - - isl::pw_aff Val = isl::aff::var_on_domain( - isl::local_space(Array->getSpace()), isl::dim::set, i); - PwAff = PwAff.add_dims(isl::dim::in, - unsignedFromIslSize(Val.dim(isl::dim::in))); - PwAff = PwAff.set_tuple_id(isl::dim::in, Val.get_tuple_id(isl::dim::in)); - isl::set Set = PwAff.gt_set(Val); - Extent = Set.intersect(Extent); - } - - return Extent; - } - - /// Derive the bounds of an array. - /// - /// For the first dimension we derive the bound of the array from the extent - /// of this dimension. For inner dimensions we obtain their size directly from - /// ScopArrayInfo. - /// - /// @param PPCGArray The array to compute bounds for. - /// @param Array The polly array from which to take the information. - void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) { - std::vector Bounds; - - if (PPCGArray.n_index > 0) { - if (isl_set_is_empty(PPCGArray.extent)) { - isl_set *Dom = isl_set_copy(PPCGArray.extent); - isl_local_space *LS = isl_local_space_from_space( - isl_space_params(isl_set_get_space(Dom))); - isl_set_free(Dom); - isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS)); - Bounds.push_back(Zero); - } else { - isl_set *Dom = isl_set_copy(PPCGArray.extent); - Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1); - isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0); - isl_set_free(Dom); - Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound)); - isl_local_space *LS = - isl_local_space_from_space(isl_set_get_space(Dom)); - isl_aff *One = isl_aff_zero_on_domain(LS); - One = isl_aff_add_constant_si(One, 1); - Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One)); - Bound = isl_pw_aff_gist(Bound, S->getContext().release()); - Bounds.push_back(Bound); - } - } - - for (unsigned i = 1; i < PPCGArray.n_index; ++i) { - isl_pw_aff *Bound = Array->getDimensionSizePw(i).release(); - auto LS = 
isl_pw_aff_get_domain_space(Bound); - auto Aff = isl_multi_aff_zero(LS); - - // We need types to work out, which is why we perform this weird dance - // with `Aff` and `Bound`. Consider this example: - - // LS: [p] -> { [] } - // Zero: [p] -> { [] } | Implicitly, is [p] -> { ~ -> [] }. - // This `~` is used to denote a "null space" (which is different from - // a *zero dimensional* space), which is something that ISL does not - // show you when pretty printing. - - // Bound: [p] -> { [] -> [(10p)] } | Here, the [] is a *zero dimensional* - // space, not a "null space" which does not exist at all. - - // When we pullback (precompose) `Bound` with `Zero`, we get: - // Bound . Zero = - // ([p] -> { [] -> [(10p)] }) . ([p] -> {~ -> [] }) = - // [p] -> { ~ -> [(10p)] } = - // [p] -> [(10p)] (as ISL pretty prints it) - // Bound Pullback: [p] -> { [(10p)] } - - // We want this kind of an expression for Bound, without a - // zero dimensional input, but with a "null space" input for the types - // to work out later on, as far as I (Siddharth Bhat) understand. - // I was unable to find a reference to this in the ISL manual. - // References: Tobias Grosser. - - Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff); - Bounds.push_back(Bound); - } - - /// To construct a `isl_multi_pw_aff`, we need all the indivisual `pw_aff` - /// to have the same parameter dimensions. So, we need to align them to an - /// appropriate space. - /// Scop::Context is _not_ an appropriate space, because when we have - /// `-polly-ignore-parameter-bounds` enabled, the Scop::Context does not - /// contain all parameter dimensions. - /// So, use the helper `alignPwAffs` to align all the `isl_pw_aff` together. 
- isl_space *SeedAlignSpace = S->getParamSpace().release(); - SeedAlignSpace = isl_space_add_dims(SeedAlignSpace, isl_dim_set, 1); - - isl_space *AlignSpace = nullptr; - std::vector AlignedBounds; - std::tie(AlignSpace, AlignedBounds) = - alignPwAffs(std::move(Bounds), SeedAlignSpace); - - assert(AlignSpace && "alignPwAffs did not initialise AlignSpace"); - - isl_pw_aff_list *BoundsList = - createPwAffList(S->getIslCtx().get(), std::move(AlignedBounds)); - - isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent); - BoundsSpace = isl_space_align_params(BoundsSpace, AlignSpace); - - assert(BoundsSpace && "Unable to access space of array."); - assert(BoundsList && "Unable to access list of bounds."); - - PPCGArray.bound = - isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList); - assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly."); - } - - /// Create the arrays for @p PPCGProg. - /// - /// @param PPCGProg The program to compute the arrays for. - void createArrays(gpu_prog *PPCGProg, - const SmallVector &ValidSAIs) { - int i = 0; - for (auto &Array : ValidSAIs) { - std::string TypeName; - raw_string_ostream OS(TypeName); - - OS << *Array->getElementType(); - TypeName = OS.str(); - - gpu_array_info &PPCGArray = PPCGProg->array[i]; - - PPCGArray.space = Array->getSpace().release(); - PPCGArray.type = strdup(TypeName.c_str()); - PPCGArray.size = DL->getTypeAllocSize(Array->getElementType()); - PPCGArray.name = strdup(Array->getName().c_str()); - PPCGArray.extent = nullptr; - PPCGArray.n_index = Array->getNumberOfDimensions(); - PPCGArray.extent = getExtent(Array).release(); - PPCGArray.n_ref = 0; - PPCGArray.refs = nullptr; - PPCGArray.accessed = true; - PPCGArray.read_only_scalar = - Array->isReadOnly() && Array->getNumberOfDimensions() == 0; - PPCGArray.has_compound_element = false; - PPCGArray.local = false; - PPCGArray.declare_local = false; - PPCGArray.global = false; - PPCGArray.linearize = false; - PPCGArray.dep_order = nullptr; - 
PPCGArray.user = Array; - - PPCGArray.bound = nullptr; - setArrayBounds(PPCGArray, Array); - i++; - - collect_references(PPCGProg, &PPCGArray); - PPCGArray.only_fixed_element = only_fixed_element_accessed(&PPCGArray); - } - } - - /// Create an identity map between the arrays in the scop. - /// - /// @returns An identity map between the arrays in the scop. - isl_union_map *getArrayIdentity() { - isl_union_map *Maps = isl_union_map_empty(S->getParamSpace().release()); - - for (auto &Array : S->arrays()) { - isl_space *Space = Array->getSpace().release(); - Space = isl_space_map_from_set(Space); - isl_map *Identity = isl_map_identity(Space); - Maps = isl_union_map_add_map(Maps, Identity); - } - - return Maps; - } - - /// Create a default-initialized PPCG GPU program. - /// - /// @returns A new gpu program description. - gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) { - - if (!PPCGScop) - return nullptr; - - auto PPCGProg = isl_calloc_type(S->getIslCtx().get(), struct gpu_prog); - - PPCGProg->ctx = S->getIslCtx().get(); - PPCGProg->scop = PPCGScop; - PPCGProg->context = isl_set_copy(PPCGScop->context); - PPCGProg->read = isl_union_map_copy(PPCGScop->reads); - PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes); - PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes); - PPCGProg->tagged_must_kill = - isl_union_map_copy(PPCGScop->tagged_must_kills); - PPCGProg->to_inner = getArrayIdentity(); - PPCGProg->to_outer = getArrayIdentity(); - // TODO: verify that this assignment is correct. - PPCGProg->any_to_outer = nullptr; - PPCGProg->n_stmts = std::distance(S->begin(), S->end()); - PPCGProg->stmts = getStatements(); - - // Only consider arrays that have a non-empty extent. - // Otherwise, this will cause us to consider the following kinds of - // empty arrays: - // 1. Invariant loads that are represented by SAI objects. - // 2. Arrays with statically known zero size. 
- auto ValidSAIsRange = - make_filter_range(S->arrays(), [this](ScopArrayInfo *SAI) -> bool { - return !getExtent(SAI).is_empty(); - }); - SmallVector ValidSAIs(ValidSAIsRange.begin(), - ValidSAIsRange.end()); - - PPCGProg->n_array = - ValidSAIs.size(); // std::distance(S->array_begin(), S->array_end()); - PPCGProg->array = isl_calloc_array( - S->getIslCtx().get(), struct gpu_array_info, PPCGProg->n_array); - - createArrays(PPCGProg, ValidSAIs); - - PPCGProg->array_order = nullptr; - collect_order_dependences(PPCGProg); - - PPCGProg->may_persist = compute_may_persist(PPCGProg); - return PPCGProg; - } - - struct PrintGPUUserData { - struct cuda_info *CudaInfo; - struct gpu_prog *PPCGProg; - std::vector Kernels; - }; - - /// Print a user statement node in the host code. - /// - /// We use ppcg's printing facilities to print the actual statement and - /// additionally build up a list of all kernels that are encountered in the - /// host ast. - /// - /// @param P The printer to print to - /// @param Options The printing options to use - /// @param Node The node to print - /// @param User A user pointer to carry additional data. This pointer is - /// expected to be of type PrintGPUUserData. - /// - /// @returns A printer to which the output has been printed. - static __isl_give isl_printer * - printHostUser(__isl_take isl_printer *P, - __isl_take isl_ast_print_options *Options, - __isl_take isl_ast_node *Node, void *User) { - auto Data = (struct PrintGPUUserData *)User; - auto Id = isl_ast_node_get_annotation(Node); - - if (Id) { - bool IsUser = !strcmp(isl_id_get_name(Id), "user"); - - // If this is a user statement, format it ourselves as ppcg would - // otherwise try to call pet functionality that is not available in - // Polly. 
- if (IsUser) { - P = isl_printer_start_line(P); - P = isl_printer_print_ast_node(P, Node); - P = isl_printer_end_line(P); - isl_id_free(Id); - isl_ast_print_options_free(Options); - return P; - } - - auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); - isl_id_free(Id); - Data->Kernels.push_back(Kernel); - } - - return print_host_user(P, Options, Node, User); - } - - /// Print C code corresponding to the control flow in @p Kernel. - /// - /// @param Kernel The kernel to print - void printKernel(ppcg_kernel *Kernel) { - auto *P = isl_printer_to_str(S->getIslCtx().get()); - P = isl_printer_set_output_format(P, ISL_FORMAT_C); - auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get()); - P = isl_ast_node_print(Kernel->tree, P, Options); - char *String = isl_printer_get_str(P); - outs() << String << "\n"; - free(String); - isl_printer_free(P); - } - - /// Print C code corresponding to the GPU code described by @p Tree. - /// - /// @param Tree An AST describing GPU code - /// @param PPCGProg The PPCG program from which @Tree has been constructed. - void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { - auto *P = isl_printer_to_str(S->getIslCtx().get()); - P = isl_printer_set_output_format(P, ISL_FORMAT_C); - - PrintGPUUserData Data; - Data.PPCGProg = PPCGProg; - - auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get()); - Options = - isl_ast_print_options_set_print_user(Options, printHostUser, &Data); - P = isl_ast_node_print(Tree, P, Options); - char *String = isl_printer_get_str(P); - outs() << "# host\n"; - outs() << String << "\n"; - free(String); - isl_printer_free(P); - - for (auto Kernel : Data.Kernels) { - outs() << "# kernel" << Kernel->id << "\n"; - printKernel(Kernel); - } - } - - // Generate a GPU program using PPCG. - // - // GPU mapping consists of multiple steps: - // - // 1) Compute new schedule for the program. 
- // 2) Map schedule to GPU (TODO) - // 3) Generate code for new schedule (TODO) - // - // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer - // is mostly CPU specific. Instead, we use PPCG's GPU code generation - // strategy directly from this pass. - gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { - - auto PPCGGen = isl_calloc_type(S->getIslCtx().get(), struct gpu_gen); - - PPCGGen->ctx = S->getIslCtx().get(); - PPCGGen->options = PPCGScop->options; - PPCGGen->print = nullptr; - PPCGGen->print_user = nullptr; - PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; - PPCGGen->prog = PPCGProg; - PPCGGen->tree = nullptr; - PPCGGen->types.n = 0; - PPCGGen->types.name = nullptr; - PPCGGen->sizes = nullptr; - PPCGGen->used_sizes = nullptr; - PPCGGen->kernel_id = 0; - - // Set scheduling strategy to same strategy PPCG is using. - isl_options_set_schedule_serialize_sccs(PPCGGen->ctx, false); - isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); - isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); - isl_options_set_schedule_whole_component(PPCGGen->ctx, false); - - isl_schedule *Schedule = get_schedule(PPCGGen); - - int has_permutable = has_any_permutable_node(Schedule); - - Schedule = - isl_schedule_align_params(Schedule, S->getFullParamSpace().release()); - - if (!has_permutable || has_permutable < 0) { - Schedule = isl_schedule_free(Schedule); - LLVM_DEBUG(dbgs() << getUniqueScopName(S) - << " does not have permutable bands. 
Bailing out\n";); - } else { - const bool CreateTransferToFromDevice = !PollyManagedMemory; - Schedule = map_to_device(PPCGGen, Schedule, CreateTransferToFromDevice); - PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); - } - - if (DumpSchedule) { - isl_printer *P = isl_printer_to_str(S->getIslCtx().get()); - P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); - P = isl_printer_print_str(P, "Schedule\n"); - P = isl_printer_print_str(P, "========\n"); - if (Schedule) - P = isl_printer_print_schedule(P, Schedule); - else - P = isl_printer_print_str(P, "No schedule found\n"); - - outs() << isl_printer_get_str(P) << "\n"; - isl_printer_free(P); - } - - if (DumpCode) { - outs() << "Code\n"; - outs() << "====\n"; - if (PPCGGen->tree) - printGPUTree(PPCGGen->tree, PPCGProg); - else - outs() << "No code generated\n"; - } - - isl_schedule_free(Schedule); - - return PPCGGen; - } - - /// Free gpu_gen structure. - /// - /// @param PPCGGen The ppcg_gen object to free. - void freePPCGGen(gpu_gen *PPCGGen) { - isl_ast_node_free(PPCGGen->tree); - isl_union_map_free(PPCGGen->sizes); - isl_union_map_free(PPCGGen->used_sizes); - free(PPCGGen); - } - - /// Free the options in the ppcg scop structure. - /// - /// ppcg is not freeing these options for us. To avoid leaks we do this - /// ourselves. - /// - /// @param PPCGScop The scop referencing the options to free. - void freeOptions(ppcg_scop *PPCGScop) { - free(PPCGScop->options->debug); - PPCGScop->options->debug = nullptr; - free(PPCGScop->options); - PPCGScop->options = nullptr; - } - - /// Approximate the number of points in the set. - /// - /// This function returns an ast expression that overapproximates the number - /// of points in an isl set through the rectangular hull surrounding this set. - /// - /// @param Set The set to count. - /// @param Build The isl ast build object to use for creating the ast - /// expression. - /// - /// @returns An approximation of the number of points in the set. 
- __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set, - __isl_keep isl_ast_build *Build) { - - isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1); - auto *Expr = isl_ast_expr_from_val(isl_val_copy(One)); - - isl_space *Space = isl_set_get_space(Set); - Space = isl_space_params(Space); - auto *Univ = isl_set_universe(Space); - isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One); - - for (long i = 0, n = isl_set_dim(Set, isl_dim_set); i < n; i++) { - isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i); - isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i); - isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min); - DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff)); - auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize); - Expr = isl_ast_expr_mul(Expr, DimSizeExpr); - } - - isl_set_free(Set); - isl_pw_aff_free(OneAff); - - return Expr; - } - - /// Approximate a number of dynamic instructions executed by a given - /// statement. - /// - /// @param Stmt The statement for which to compute the number of dynamic - /// instructions. - /// @param Build The isl ast build object to use for creating the ast - /// expression. - /// @returns An approximation of the number of dynamic instructions executed - /// by @p Stmt. - __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt, - __isl_keep isl_ast_build *Build) { - auto Iterations = approxPointsInSet(Stmt.getDomain().release(), Build); - - long InstCount = 0; - - if (Stmt.isBlockStmt()) { - auto *BB = Stmt.getBasicBlock(); - InstCount = std::distance(BB->begin(), BB->end()); - } else { - auto *R = Stmt.getRegion(); - - for (auto *BB : R->blocks()) { - InstCount += std::distance(BB->begin(), BB->end()); - } - } - - isl_val *InstVal = isl_val_int_from_si(S->getIslCtx().get(), InstCount); - auto *InstExpr = isl_ast_expr_from_val(InstVal); - return isl_ast_expr_mul(InstExpr, Iterations); - } - - /// Approximate dynamic instructions executed in scop. 
- /// - /// @param S The scop for which to approximate dynamic instructions. - /// @param Build The isl ast build object to use for creating the ast - /// expression. - /// @returns An approximation of the number of dynamic instructions executed - /// in @p S. - __isl_give isl_ast_expr * - getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) { - isl_ast_expr *Instructions; - - isl_val *Zero = isl_val_int_from_si(S.getIslCtx().get(), 0); - Instructions = isl_ast_expr_from_val(Zero); - - for (ScopStmt &Stmt : S) { - isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build); - Instructions = isl_ast_expr_add(Instructions, StmtInstructions); - } - return Instructions; - } - - /// Create a check that ensures sufficient compute in scop. - /// - /// @param S The scop for which to ensure sufficient compute. - /// @param Build The isl ast build object to use for creating the ast - /// expression. - /// @returns An expression that evaluates to TRUE in case of sufficient - /// compute and to FALSE, otherwise. - __isl_give isl_ast_expr * - createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) { - auto Iterations = getNumberOfIterations(S, Build); - auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx().get(), MinCompute); - auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal); - return isl_ast_expr_ge(Iterations, MinComputeExpr); - } - - /// Check if the basic block contains a function we cannot codegen for GPU - /// kernels. - /// - /// If this basic block does something with a `Function` other than calling - /// a function that we support in a kernel, return true. - bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB, - bool AllowCUDALibDevice) { - for (const Instruction &Inst : *BB) { - const CallInst *Call = dyn_cast(&Inst); - if (Call && isValidFunctionInKernel(Call->getCalledFunction(), - AllowCUDALibDevice)) - continue; - - for (Value *Op : Inst.operands()) - // Look for functions among operands of Inst. 
- if (isa(Op->stripPointerCasts())) { - LLVM_DEBUG(dbgs() - << Inst << " has illegal use of function in kernel.\n"); - return true; - } - } - return false; - } - - /// Return whether the Scop S uses functions in a way that we do not support. - bool containsInvalidKernelFunction(const Scop &S, bool AllowCUDALibDevice) { - for (auto &Stmt : S) { - if (Stmt.isBlockStmt()) { - if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock(), - AllowCUDALibDevice)) - return true; - } else { - assert(Stmt.isRegionStmt() && - "Stmt was neither block nor region statement"); - for (const BasicBlock *BB : Stmt.getRegion()->blocks()) - if (containsInvalidKernelFunctionInBlock(BB, AllowCUDALibDevice)) - return true; - } - } - return false; - } - - /// Generate code for a given GPU AST described by @p Root. - /// - /// @param Root An isl_ast_node pointing to the root of the GPU AST. - /// @param Prog The GPU Program to generate code for. - void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { - ScopAnnotator Annotator; - Annotator.buildAliasScopes(*S); - - Region *R = &S->getRegion(); - - simplifyRegion(R, DT, LI, RI); - - BasicBlock *EnteringBB = R->getEnteringBlock(); - - PollyIRBuilder Builder(EnteringBB->getContext(), ConstantFolder(), - IRInserter(Annotator)); - Builder.SetInsertPoint(EnteringBB->getTerminator()); - - // Only build the run-time condition and parameters _after_ having - // introduced the conditional branch. This is important as the conditional - // branch will guard the original scop from new induction variables that - // the SCEVExpander may introduce while code generating the parameters and - // which may introduce scalar dependences that prevent us from correctly - // code generating this scop. 
- BBPair StartExitBlocks; - BranchInst *CondBr = nullptr; - std::tie(StartExitBlocks, CondBr) = - executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); - BasicBlock *StartBlock = std::get<0>(StartExitBlocks); - - assert(CondBr && "CondBr not initialized by executeScopConditionally"); - - GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, - StartBlock, Prog, Runtime, Architecture); - - // TODO: Handle LICM - auto SplitBlock = StartBlock->getSinglePredecessor(); - Builder.SetInsertPoint(SplitBlock->getTerminator()); - - isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx().get()); - isl::ast_expr Condition = - IslAst::buildRunCondition(*S, isl::manage_copy(Build)); - isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build); - Condition = - isl::manage(isl_ast_expr_and(Condition.release(), SufficientCompute)); - isl_ast_build_free(Build); - - // preload invariant loads. Note: This should happen before the RTC - // because the RTC may depend on values that are invariant load hoisted. - if (!NodeBuilder.preloadInvariantLoads()) { - // Patch the introduced branch condition to ensure that we always execute - // the original SCoP. - auto *FalseI1 = Builder.getFalse(); - auto *SplitBBTerm = Builder.GetInsertBlock()->getTerminator(); - SplitBBTerm->setOperand(0, FalseI1); - - LLVM_DEBUG(dbgs() << "preloading invariant loads failed in function: " + - S->getFunction().getName() + - " | Scop Region: " + S->getNameStr()); - // adjust the dominator tree accordingly. 
- auto *ExitingBlock = StartBlock->getUniqueSuccessor(); - assert(ExitingBlock); - auto *MergeBlock = ExitingBlock->getUniqueSuccessor(); - assert(MergeBlock); - polly::markBlockUnreachable(*StartBlock, Builder); - polly::markBlockUnreachable(*ExitingBlock, Builder); - auto *ExitingBB = S->getExitingBlock(); - assert(ExitingBB); - - DT->changeImmediateDominator(MergeBlock, ExitingBB); - DT->eraseNode(ExitingBlock); - isl_ast_node_free(Root); - } else { - - if (polly::PerfMonitoring) { - PerfMonitor P(*S, EnteringBB->getParent()->getParent()); - P.initialize(); - P.insertRegionStart(SplitBlock->getTerminator()); - - // TODO: actually think if this is the correct exiting block to place - // the `end` performance marker. Invariant load hoisting changes - // the CFG in a way that I do not precisely understand, so I - // (Siddharth) should come back to this and - // think about which exiting block to use. - auto *ExitingBlock = StartBlock->getUniqueSuccessor(); - assert(ExitingBlock); - BasicBlock *MergeBlock = ExitingBlock->getUniqueSuccessor(); - P.insertRegionEnd(MergeBlock->getTerminator()); - } - - NodeBuilder.addParameters(S->getContext().release()); - Value *RTC = NodeBuilder.createRTC(Condition.release()); - Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC); - - Builder.SetInsertPoint(&*StartBlock->begin()); - - NodeBuilder.create(Root); - } - - /// In case a sequential kernel has more surrounding loops as any parallel - /// kernel, the SCoP is probably mostly sequential. Hence, there is no - /// point in running it on a GPU. 
- if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel) - CondBr->setOperand(0, Builder.getFalse()); - - if (!NodeBuilder.BuildSuccessful) - CondBr->setOperand(0, Builder.getFalse()); - } - - bool runOnScop(Scop &CurrentScop) override { - S = &CurrentScop; - LI = &getAnalysis().getLoopInfo(); - DT = &getAnalysis().getDomTree(); - SE = &getAnalysis().getSE(); - DL = &S->getRegion().getEntry()->getModule()->getDataLayout(); - RI = &getAnalysis().getRegionInfo(); - - LLVM_DEBUG(dbgs() << "PPCGCodeGen running on : " << getUniqueScopName(S) - << " | loop depth: " << S->getMaxLoopDepth() << "\n"); - - // We currently do not support functions other than intrinsics inside - // kernels, as code generation will need to offload function calls to the - // kernel. This may lead to a kernel trying to call a function on the host. - // This also allows us to prevent codegen from trying to take the - // address of an intrinsic function to send to the kernel. - if (containsInvalidKernelFunction(CurrentScop, - Architecture == GPUArch::NVPTX64)) { - LLVM_DEBUG( - dbgs() << getUniqueScopName(S) - << " contains function which cannot be materialised in a GPU " - "kernel. Bailing out.\n";); - return false; - } - - auto PPCGScop = createPPCGScop(); - auto PPCGProg = createPPCGProg(PPCGScop); - auto PPCGGen = generateGPU(PPCGScop, PPCGProg); - - if (PPCGGen->tree) { - generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); - CurrentScop.markAsToBeSkipped(); - } else { - LLVM_DEBUG(dbgs() << getUniqueScopName(S) - << " has empty PPCGGen->tree. 
Bailing out.\n"); - } - - freeOptions(PPCGScop); - freePPCGGen(PPCGGen); - gpu_prog_free(PPCGProg); - ppcg_scop_free(PPCGScop); - - return true; - } - - void printScop(raw_ostream &, Scop &) const override {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - - // FIXME: We do not yet add regions for the newly generated code to the - // region tree. - } -}; -} // namespace - -char PPCGCodeGeneration::ID = 1; - -Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) { - PPCGCodeGeneration *generator = new PPCGCodeGeneration(); - generator->Runtime = Runtime; - generator->Architecture = Arch; - return generator; -} - -INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", - "Polly - Apply PPCG translation to SCOP", false, false) -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg", - "Polly - Apply PPCG translation to SCOP", false, false) diff --git a/polly/lib/CodeGen/RuntimeDebugBuilder.cpp b/polly/lib/CodeGen/RuntimeDebugBuilder.cpp --- a/polly/lib/CodeGen/RuntimeDebugBuilder.cpp +++ b/polly/lib/CodeGen/RuntimeDebugBuilder.cpp @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include "polly/CodeGen/RuntimeDebugBuilder.h" -#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/Module.h" #include #include @@ -17,6 +16,16 @@ using namespace llvm; using namespace polly; +llvm::Value *RuntimeDebugBuilder::getPrintableString(PollyIRBuilder &Builder, + llvm::StringRef Str) { + // FIXME: addressspace(4) is a marker for 
a string (for the %s conversion + // specifier) but should be using the default address space. This only works + // because CPU backends typically ignore the address space. For constant + // strings as returned by getPrintableString, the format string should instead + // directly spell out the string. + return Builder.CreateGlobalStringPtr(Str, "", 4); +} + Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); const char *Name = "vprintf"; @@ -33,72 +42,9 @@ return F; } -Function *RuntimeDebugBuilder::getAddressSpaceCast(PollyIRBuilder &Builder, - unsigned Src, unsigned Dst, - unsigned SrcBits, - unsigned DstBits) { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - auto Name = std::string("llvm.nvvm.ptr.constant.to.gen.p") + - std::to_string(Dst) + "i" + std::to_string(DstBits) + ".p" + - std::to_string(Src) + "i" + std::to_string(SrcBits); - Function *F = M->getFunction(Name); - - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - FunctionType *Ty = FunctionType::get( - PointerType::get(Builder.getIntNTy(DstBits), Dst), - PointerType::get(Builder.getIntNTy(SrcBits), Src), false); - F = Function::Create(Ty, Linkage, Name, M); - } - - return F; -} - -std::vector -RuntimeDebugBuilder::getGPUThreadIdentifiers(PollyIRBuilder &Builder) { - std::vector Identifiers; - - auto M = Builder.GetInsertBlock()->getParent()->getParent(); - - std::vector BlockIDs = { - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_x), - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_y), - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_z), - }; - - Identifiers.push_back(Builder.CreateGlobalStringPtr("> block-id: ", "", 4)); - for (auto GetID : BlockIDs) { - Value *Id = Builder.CreateCall(GetID, {}); - Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false); - Identifiers.push_back(Id); - 
Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4)); - } - - Identifiers.push_back(Builder.CreateGlobalStringPtr("| ", "", 4)); - - std::vector ThreadIDs = { - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_x), - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_y), - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_z), - }; - - Identifiers.push_back(Builder.CreateGlobalStringPtr("thread-id: ", "", 4)); - for (auto GetId : ThreadIDs) { - Value *Id = Builder.CreateCall(GetId, {}); - Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false); - Identifiers.push_back(Id); - Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4)); - } - - return Identifiers; -} - -void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, bool IsGPU, +void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, ArrayRef Values) { - if (IsGPU) - createGPUPrinterT(Builder, Values); - else - createCPUPrinterT(Builder, Values); + createCPUPrinterT(Builder, Values); } bool RuntimeDebugBuilder::isPrintable(Type *Ty) { @@ -169,78 +115,6 @@ createFlush(Builder); } -void RuntimeDebugBuilder::createGPUPrinterT(PollyIRBuilder &Builder, - ArrayRef Values) { - std::string str; - - auto *Zero = Builder.getInt64(0); - - auto ToPrint = getGPUThreadIdentifiers(Builder); - - ToPrint.push_back(Builder.CreateGlobalStringPtr("\n ", "", 4)); - ToPrint.insert(ToPrint.end(), Values.begin(), Values.end()); - - const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); - - // Allocate print buffer (assuming 2*32 bit per element) - auto T = ArrayType::get(Builder.getInt32Ty(), ToPrint.size() * 2); - Value *Data = new AllocaInst( - T, DL.getAllocaAddrSpace(), "polly.vprint.buffer", - &Builder.GetInsertBlock()->getParent()->getEntryBlock().front()); - auto *DataPtr = Builder.CreateGEP(T, Data, {Zero, Zero}); - - int Offset = 0; - for (auto Val : ToPrint) { - auto Ptr = Builder.CreateGEP(Builder.getInt32Ty(), DataPtr, - 
Builder.getInt64(Offset)); - Type *Ty = Val->getType(); - - if (Ty->isFloatingPointTy()) { - if (!Ty->isDoubleTy()) - Val = Builder.CreateFPExt(Val, Builder.getDoubleTy()); - } else if (Ty->isIntegerTy()) { - if (Ty->getIntegerBitWidth() < 64) { - Val = Builder.CreateSExt(Val, Builder.getInt64Ty()); - } else { - assert(Ty->getIntegerBitWidth() == 64 && - "Integer types larger 64 bit not supported"); - // fallthrough - } - } else if (isa(Ty)) { - if (Ty == Builder.getInt8PtrTy(4)) { - // Pointers in constant address space are printed as strings - Val = Builder.CreateGEP(Builder.getInt8Ty(), Val, Builder.getInt64(0)); - auto F = RuntimeDebugBuilder::getAddressSpaceCast(Builder, 4, 0); - Val = Builder.CreateCall(F, Val); - } else { - Val = Builder.CreatePtrToInt(Val, Builder.getInt64Ty()); - } - } else { - llvm_unreachable("Unknown type"); - } - - Ty = Val->getType(); - Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Ty->getPointerTo(5)); - Builder.CreateAlignedStore(Val, Ptr, Align(4)); - - if (Ty->isFloatingPointTy()) - str += "%f"; - else if (Ty->isIntegerTy()) - str += "%ld"; - else - str += "%s"; - - Offset += 2; - } - - Value *Format = Builder.CreateGlobalStringPtr(str, "polly.vprintf.buffer", 4); - Format = Builder.CreateCall(getAddressSpaceCast(Builder, 4, 0), Format); - - Data = Builder.CreateBitCast(Data, Builder.getInt8PtrTy()); - - Builder.CreateCall(getVPrintF(Builder), {Format, Data}); -} - Function *RuntimeDebugBuilder::getPrintF(PollyIRBuilder &Builder) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); const char *Name = "printf"; diff --git a/polly/lib/External/CMakeLists.txt b/polly/lib/External/CMakeLists.txt --- a/polly/lib/External/CMakeLists.txt +++ b/polly/lib/External/CMakeLists.txt @@ -314,91 +314,3 @@ target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS}) target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS}) endif (POLLY_BUNDLED_ISL) - - -# External: Polyhedral Parallel Code Generator -if 
(GPU_CODEGEN) - set(PET_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pet") - set(PPCG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ppcg") - set(PPCG_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/ppcg") - - # Determine version of ppcg - if (EXISTS "${PPCG_SOURCE_DIR}/GIT_HEAD_ID") - # The source comes from a 'make dist' archive - file(READ "${PPCG_SOURCE_DIR}/GIT_HEAD_ID" PPCG_GIT_HEAD_ID) - string(STRIP "${PPCG_GIT_HEAD_ID}" PPCG_GIT_HEAD_ID) - elseif (EXISTS "${PPCG_SOURCE_DIR}/gitversion.h") - # The source directory is preconfigured - file(READ "${PPCG_SOURCE_DIR}/gitversion.h" GITVERSION_H) - string(REGEX REPLACE ".*\\\"([^\\\"]*)\\\".*" "\\1" PPCG_GIT_HEAD_ID "${GITVERSION_H}") - elseif () - # Unknown revision - # TODO: We could look for a .git and get the revision from HEAD - set(PPCG_GIT_HEAD_ID "UNKNOWN") - endif () - - message(STATUS "PPCG version: ${PPCG_GIT_HEAD_ID}") - - set (PPCG_FILES - ppcg/cuda.c - ppcg/cuda_common.c - ppcg/external.c - ppcg/gpu_array_tile.c - ppcg/gpu.c - ppcg/gpu_array_tile.c - ppcg/gpu_group.c - ppcg/gpu_hybrid.c - ppcg/gpu_print.c - ppcg/gpu_tree.c - ppcg/grouping.c - ppcg/hybrid.c - ppcg/ppcg.c - ppcg/ppcg_options.c - ppcg/print.c - ppcg/schedule.c - ppcg/util.c - ) - - include_directories(BEFORE - ${PPCG_BINARY_DIR} - ${PPCG_SOURCE_DIR}/imath - ${PPCG_SOURCE_DIR}/include - ${PET_SOURCE_DIR}/include - ) - - add_polly_library(PollyPPCG - ${PPCG_FILES} - ) - - target_link_libraries(PollyPPCG PUBLIC ${ISL_TARGET}) - - # Disable warnings for upstream projects. - if (MSVC) - set(DISABLE_WARNING_FLAGS - -wd4018 # 'expression' : signed/unsigned mismatch - -wd4090 # 'operation' : different 'modifier' qualifiers - -wd4200 # nonstandard extension used: zero-sized array in struct/union - -wd4201 # nonstandard extension used: nameless struct/union - -wd4334 # 'operator': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?) 
- -wd4221 # nonstandard extension used : 'identifier' : cannot be initialized using address of automatic variable - ) - if (POLLY_BUNDLED_ISL) - target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS}) - target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS}) - endif (POLLY_BUNDLED_ISL) - target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS}) - else () - if (POLLY_BUNDLED_ISL) - set_target_properties(PollyISL polly-isl-test PROPERTIES COMPILE_FLAGS "-w") - endif (POLLY_BUNDLED_ISL) - set_target_properties(PollyPPCG PROPERTIES COMPILE_FLAGS "-w") - endif () - - if(MSVC) - # In the Windows API (with some exceptions), the maximum length for a path is - # MAX_PATH, which is defined as 260 characters. - target_compile_definitions(PollyPPCG PRIVATE "-DPATH_MAX=260") - endif () - - target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS}) -endif () diff --git a/polly/lib/External/pet/include/pet.h b/polly/lib/External/pet/include/pet.h deleted file mode 100644 --- a/polly/lib/External/pet/include/pet.h +++ /dev/null @@ -1,622 +0,0 @@ -#ifndef PET_H -#define PET_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__cplusplus) -extern "C" { -#endif - -struct pet_options; -ISL_ARG_DECL(pet_options, struct pet_options, pet_options_args) - -/* Create an isl_ctx that references the pet options. */ -isl_ctx *isl_ctx_alloc_with_pet_options(); - -/* If autodetect is set, any valid scop is extracted. - * Otherwise, the scop needs to be delimited by pragmas. - */ -int pet_options_set_autodetect(isl_ctx *ctx, int val); -int pet_options_get_autodetect(isl_ctx *ctx); - -int pet_options_set_detect_conditional_assignment(isl_ctx *ctx, int val); -int pet_options_get_detect_conditional_assignment(isl_ctx *ctx); - -/* If encapsulate-dynamic-control is set, then any dynamic control - * in the input program will be encapsulated in macro statements. 
- * This means in particular that no statements with arguments - * will be created. - */ -int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val); -int pet_options_get_encapsulate_dynamic_control(isl_ctx *ctx); - -#define PET_OVERFLOW_AVOID 0 -#define PET_OVERFLOW_IGNORE 1 -int pet_options_set_signed_overflow(isl_ctx *ctx, int val); -int pet_options_get_signed_overflow(isl_ctx *ctx); - -struct pet_loc; -typedef struct pet_loc pet_loc; - -/* Return an additional reference to "loc". */ -__isl_give pet_loc *pet_loc_copy(__isl_keep pet_loc *loc); -/* Free a reference to "loc". */ -pet_loc *pet_loc_free(__isl_take pet_loc *loc); - -/* Return the offset in the input file of the start of "loc". */ -unsigned pet_loc_get_start(__isl_keep pet_loc *loc); -/* Return the offset in the input file of the character after "loc". */ -unsigned pet_loc_get_end(__isl_keep pet_loc *loc); -/* Return the line number of a line within the "loc" region. */ -int pet_loc_get_line(__isl_keep pet_loc *loc); -/* Return the indentation of the "loc" region. 
*/ -__isl_keep const char *pet_loc_get_indent(__isl_keep pet_loc *loc); - -enum pet_expr_type { - pet_expr_error = -1, - pet_expr_access, - pet_expr_call, - pet_expr_cast, - pet_expr_int, - pet_expr_double, - pet_expr_op -}; - -enum pet_op_type { - /* only compound assignments operators before assignment */ - pet_op_add_assign, - pet_op_sub_assign, - pet_op_mul_assign, - pet_op_div_assign, - pet_op_and_assign, - pet_op_xor_assign, - pet_op_or_assign, - pet_op_assign, - pet_op_add, - pet_op_sub, - pet_op_mul, - pet_op_div, - pet_op_mod, - pet_op_shl, - pet_op_shr, - pet_op_eq, - pet_op_ne, - pet_op_le, - pet_op_ge, - pet_op_lt, - pet_op_gt, - pet_op_minus, - pet_op_post_inc, - pet_op_post_dec, - pet_op_pre_inc, - pet_op_pre_dec, - pet_op_address_of, - pet_op_assume, - pet_op_kill, - pet_op_and, - pet_op_xor, - pet_op_or, - pet_op_not, - pet_op_land, - pet_op_lor, - pet_op_lnot, - pet_op_cond, - pet_op_last -}; - -/* Index into the pet_expr->args array when pet_expr->type == pet_expr_unary - */ -enum pet_un_arg_type { - pet_un_arg -}; - -/* Indices into the pet_expr->args array when - * pet_expr->type == pet_expr_binary - */ -enum pet_bin_arg_type { - pet_bin_lhs, - pet_bin_rhs -}; - -/* Indices into the pet_expr->args array when - * pet_expr->type == pet_expr_ternary - */ -enum pet_ter_arg_type { - pet_ter_cond, - pet_ter_true, - pet_ter_false -}; - -struct pet_expr; -typedef struct pet_expr pet_expr; - -/* Return an additional reference to "expr". */ -__isl_give pet_expr *pet_expr_copy(__isl_keep pet_expr *expr); -/* Free a reference to "expr". */ -__isl_null pet_expr *pet_expr_free(__isl_take pet_expr *expr); - -/* Return the isl_ctx in which "expr" was created. */ -isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr); - -/* Return the type of "expr". */ -enum pet_expr_type pet_expr_get_type(__isl_keep pet_expr *expr); -/* Return the number of arguments of "expr". 
*/ -int pet_expr_get_n_arg(__isl_keep pet_expr *expr); -/* Set the number of arguments of "expr" to "n". */ -__isl_give pet_expr *pet_expr_set_n_arg(__isl_take pet_expr *expr, int n); -/* Return the argument of "expr" at position "pos". */ -__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos); -/* Replace the argument of "expr" at position "pos" by "arg". */ -__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos, - __isl_take pet_expr *arg); - -/* Return the operation type of operation expression "expr". */ -enum pet_op_type pet_expr_op_get_type(__isl_keep pet_expr *expr); -/* Replace the operation type of operation expression "expr" by "type". */ -__isl_give pet_expr *pet_expr_op_set_type(__isl_take pet_expr *expr, - enum pet_op_type type); - -/* Construct a (read) access pet_expr from an index expression. */ -__isl_give pet_expr *pet_expr_from_index(__isl_take isl_multi_pw_aff *index); - -/* Does "expr" represent an affine expression? */ -isl_bool pet_expr_is_affine(__isl_keep pet_expr *expr); -/* Does the access expression "expr" read the accessed elements? */ -isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr); -/* Does the access expression "expr" write to the accessed elements? */ -isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr); -/* Does the access expression "expr" kill the accessed elements? */ -isl_bool pet_expr_access_is_kill(__isl_keep pet_expr *expr); -/* Mark "expr" as a read depending on "read". */ -__isl_give pet_expr *pet_expr_access_set_read(__isl_take pet_expr *expr, - int read); -/* Mark "expr" as a write depending on "write". */ -__isl_give pet_expr *pet_expr_access_set_write(__isl_take pet_expr *expr, - int write); -/* Mark "expr" as a kill depending on "kill". */ -__isl_give pet_expr *pet_expr_access_set_kill(__isl_take pet_expr *expr, - int kill); -/* Return the reference identifier of access expression "expr". 
*/ -__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr); -/* Replace the reference identifier of access expression "expr" by "ref_id". */ -__isl_give pet_expr *pet_expr_access_set_ref_id(__isl_take pet_expr *expr, - __isl_take isl_id *ref_id); -/* Return the identifier of the outer array accessed by "expr". */ -__isl_give isl_id *pet_expr_access_get_id(__isl_keep pet_expr *expr); -/* Return the index expression of access expression "expr". */ -__isl_give isl_multi_pw_aff *pet_expr_access_get_index( - __isl_keep pet_expr *expr); - -/* Return the potential read access relation of access expression "expr". */ -__isl_give isl_union_map *pet_expr_access_get_may_read( - __isl_keep pet_expr *expr); -/* Return the potential write access relation of access expression "expr". */ -__isl_give isl_union_map *pet_expr_access_get_may_write( - __isl_keep pet_expr *expr); -/* Return the definite write access relation of access expression "expr". */ -__isl_give isl_union_map *pet_expr_access_get_must_write( - __isl_keep pet_expr *expr); -/* Return the argument dependent potential read access relation of "expr". */ -__isl_give isl_union_map *pet_expr_access_get_dependent_may_read( - __isl_keep pet_expr *expr); -/* Return the argument dependent potential write access relation of "expr". */ -__isl_give isl_union_map *pet_expr_access_get_dependent_may_write( - __isl_keep pet_expr *expr); -/* Return the argument dependent definite write access relation of "expr". */ -__isl_give isl_union_map *pet_expr_access_get_dependent_must_write( - __isl_keep pet_expr *expr); -/* Return the tagged potential read access relation of access "expr". */ -__isl_give isl_union_map *pet_expr_access_get_tagged_may_read( - __isl_keep pet_expr *expr); -/* Return the tagged potential write access relation of access "expr". */ -__isl_give isl_union_map *pet_expr_access_get_tagged_may_write( - __isl_keep pet_expr *expr); - -/* Return the name of the function called by "expr". 
*/ -__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr); -/* Replace the name of the function called by "expr" by "name". */ -__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr, - __isl_keep const char *name); - -/* Create a pet_expr representing a cast of "arg" to "type_name". */ -__isl_give pet_expr *pet_expr_new_cast(const char *type_name, - __isl_take pet_expr *arg); -/* Replace the type of the cast performed by "expr" by "name". */ -__isl_give pet_expr *pet_expr_cast_set_type_name(__isl_take pet_expr *expr, - __isl_keep const char *name); - -/* Return the value of the integer represented by "expr". */ -__isl_give isl_val *pet_expr_int_get_val(__isl_keep pet_expr *expr); -/* Replace the value of the integer represented by "expr" by "v". */ -__isl_give pet_expr *pet_expr_int_set_val(__isl_take pet_expr *expr, - __isl_take isl_val *v); - -/* Return a string representation of the double expression "expr". */ -__isl_give char *pet_expr_double_get_str(__isl_keep pet_expr *expr); -/* Replace value and string representation of the double expression "expr" */ -__isl_give pet_expr *pet_expr_double_set(__isl_take pet_expr *expr, - double d, __isl_keep const char *s); - -/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_access. */ -int pet_expr_foreach_access_expr(__isl_keep pet_expr *expr, - int (*fn)(__isl_keep pet_expr *expr, void *user), void *user); -/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_call. */ -int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr, - int (*fn)(__isl_keep pet_expr *expr, void *user), void *user); - -struct pet_context; -typedef struct pet_context pet_context; - -/* Create a context with the given domain. */ -__isl_give pet_context *pet_context_alloc(__isl_take isl_set *domain); -/* Return an additional reference to "pc". */ -__isl_give pet_context *pet_context_copy(__isl_keep pet_context *pc); -/* Free a reference to "pc". 
*/ -__isl_null pet_context *pet_context_free(__isl_take pet_context *pc); - -/* Return the isl_ctx in which "pc" was created. */ -isl_ctx *pet_context_get_ctx(__isl_keep pet_context *pc); - -/* Extract an affine expression defined over the domain of "pc" from "expr" - * or return NaN. - */ -__isl_give isl_pw_aff *pet_expr_extract_affine(__isl_keep pet_expr *expr, - __isl_keep pet_context *pc); - -void pet_expr_dump(__isl_keep pet_expr *expr); - -enum pet_tree_type { - pet_tree_error = -1, - pet_tree_expr, - pet_tree_block, - pet_tree_break, - pet_tree_continue, - pet_tree_decl, /* A declaration without initialization */ - pet_tree_decl_init, /* A declaration with initialization */ - pet_tree_if, /* An if without an else branch */ - pet_tree_if_else, /* An if with an else branch */ - pet_tree_for, - pet_tree_infinite_loop, - pet_tree_while, - pet_tree_return, -}; - -struct pet_tree; -typedef struct pet_tree pet_tree; - -/* Return the isl_ctx in which "tree" was created. */ -isl_ctx *pet_tree_get_ctx(__isl_keep pet_tree *tree); - -/* Return an additional reference to "tree". */ -__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree); -/* Free a reference to "tree". */ -__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree); - -/* Return the location of "tree". */ -__isl_give pet_loc *pet_tree_get_loc(__isl_keep pet_tree *tree); - -/* Return the type of "tree". */ -enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree); - -/* Return the expression of the expression tree "tree". */ -__isl_give pet_expr *pet_tree_expr_get_expr(__isl_keep pet_tree *tree); - -/* Return the expression returned by the return tree "tree". */ -__isl_give pet_expr *pet_tree_return_get_expr(__isl_keep pet_tree *tree); - -/* Return the number of children of the block tree "tree". */ -int pet_tree_block_n_child(__isl_keep pet_tree *tree); -/* Return child "pos" of the block tree "tree". 
*/ -__isl_give pet_tree *pet_tree_block_get_child(__isl_keep pet_tree *tree, - int pos); - -/* Is "tree" a declaration (with or without initialization)? */ -int pet_tree_is_decl(__isl_keep pet_tree *tree); -/* Return the variable declared by the declaration tree "tree". */ -__isl_give pet_expr *pet_tree_decl_get_var(__isl_keep pet_tree *tree); -/* Return the initial value of the pet_tree_decl_init tree "tree". */ -__isl_give pet_expr *pet_tree_decl_get_init(__isl_keep pet_tree *tree); - -/* Return the condition of the if tree "tree". */ -__isl_give pet_expr *pet_tree_if_get_cond(__isl_keep pet_tree *tree); -/* Return the then branch of the if tree "tree". */ -__isl_give pet_tree *pet_tree_if_get_then(__isl_keep pet_tree *tree); -/* Return the else branch of the if tree with else branch "tree". */ -__isl_give pet_tree *pet_tree_if_get_else(__isl_keep pet_tree *tree); - -/* Is "tree" a for loop, a while loop or an infinite loop? */ -int pet_tree_is_loop(__isl_keep pet_tree *tree); -/* Return the induction variable of the for loop "tree" */ -__isl_give pet_expr *pet_tree_loop_get_var(__isl_keep pet_tree *tree); -/* Return the initial value of the induction variable of the for loop "tree" */ -__isl_give pet_expr *pet_tree_loop_get_init(__isl_keep pet_tree *tree); -/* Return the condition of the loop tree "tree" */ -__isl_give pet_expr *pet_tree_loop_get_cond(__isl_keep pet_tree *tree); -/* Return the induction variable of the for loop "tree" */ -__isl_give pet_expr *pet_tree_loop_get_inc(__isl_keep pet_tree *tree); -/* Return the body of the loop tree "tree" */ -__isl_give pet_tree *pet_tree_loop_get_body(__isl_keep pet_tree *tree); - -/* Call "fn" on each top-level expression in the nodes of "tree" */ -int pet_tree_foreach_expr(__isl_keep pet_tree *tree, - int (*fn)(__isl_keep pet_expr *expr, void *user), void *user); -/* Call "fn" on each access subexpression in the nodes of "tree" */ -int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree, - int (*fn)(__isl_keep 
pet_expr *expr, void *user), void *user); -/* Modify all call subexpressions in the nodes of "tree" through "fn". */ -__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree, - __isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user), - void *user); - -void pet_tree_dump(__isl_keep pet_tree *tree); - -/* "loc" represents the region of the source code that is represented - * by this statement. - * - * If the statement has arguments, i.e., n_arg != 0, then - * "domain" is a wrapped map, mapping the iteration domain - * to the values of the arguments for which this statement - * is executed. - * Otherwise, it is simply the iteration domain. - * - * If one of the arguments is an access expression that accesses - * more than one element for a given iteration, then the constraints - * on the value of this argument (encoded in "domain") should be satisfied - * for all of those accessed elements. - */ -struct pet_stmt { - pet_loc *loc; - isl_set *domain; - pet_tree *body; - - unsigned n_arg; - pet_expr **args; -}; - -/* Return the iteration space of "stmt". */ -__isl_give isl_space *pet_stmt_get_space(struct pet_stmt *stmt); - -/* Is "stmt" an assignment statement? */ -int pet_stmt_is_assign(struct pet_stmt *stmt); -/* Is "stmt" a kill statement? */ -int pet_stmt_is_kill(struct pet_stmt *stmt); - -/* pet_stmt_build_ast_exprs is currently limited to only handle - * some forms of data dependent accesses. - * If pet_stmt_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs - * can safely be called on "stmt". - */ -int pet_stmt_can_build_ast_exprs(struct pet_stmt *stmt); -/* Construct an associative array from reference identifiers of - * access expressions in "stmt" to the corresponding isl_ast_expr. - * Each index expression is first transformed through "fn_index" - * (if not NULL). Then an AST expression is generated using "build". - * Finally, the AST expression is transformed using "fn_expr" - * (if not NULL). 
- */ -__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt, - __isl_keep isl_ast_build *build, - __isl_give isl_multi_pw_aff *(*fn_index)( - __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id, - void *user), void *user_index, - __isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr, - __isl_keep isl_id *id, void *user), void *user_expr); - -/* Print "stmt" to "p". - * - * The access expressions in "stmt" are replaced by the isl_ast_expr - * associated to its reference identifier in "ref2expr". - */ -__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt, - __isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr); - -/* This structure represents a defined type. - * "name" is the name of the type, while "definition" is a string - * representation of its definition. - */ -struct pet_type { - char *name; - char *definition; -}; - -/* context holds constraints on the parameter that ensure that - * this array has a valid (i.e., non-negative) size - * - * extent holds constraints on the indices - * - * value_bounds holds constraints on the elements of the array - * and may be NULL if no such constraints were specified by the user - * - * element_size is the size in bytes of each array element - * element_type is the type of the array elements. - * element_is_record is set if this type is a record type. - * - * live_out is set if the array appears in a live-out pragma - * - * if uniquely_defined is set then the array is written by a single access - * such that any element that is ever read - * is known to be assigned exactly once before the read - * - * declared is set if the array was declared somewhere inside the scop. - * exposed is set if the declared array is visible outside the scop. - * outer is set if the type of the array elements is a record and - * the fields of this record are represented by separate pet_array structures. 
- */ -struct pet_array { - isl_set *context; - isl_set *extent; - isl_set *value_bounds; - char *element_type; - int element_is_record; - int element_size; - int live_out; - int uniquely_defined; - int declared; - int exposed; - int outer; -}; - -/* This structure represents an implication on a boolean filter. - * In particular, if the filter value of an element in the domain - * of "extension" is equal to "satisfied", then the filter values - * of the corresponding images in "extension" are also equal - * to "satisfied". - */ -struct pet_implication { - int satisfied; - isl_map *extension; -}; - -/* This structure represents an independence implied by a for loop - * that is marked as independent in the source code. - * "filter" contains pairs of statement instances that are guaranteed - * not to be dependent on each other based on the independent for loop, - * assuming that no dependences carried by this loop are implied - * by the variables in "local". - * "local" contains the variables that are local to the loop that was - * marked independent. - */ -struct pet_independence { - isl_union_map *filter; - isl_union_set *local; -}; - -/* "loc" represents the region of the source code that is represented - * by this scop. - * If the scop was detected based on scop and endscop pragmas, then - * the lines containing these pragmas are included in this region. - * In the final result, the context describes the set of parameter values - * for which the scop can be executed. - * During the construction of the pet_scop, the context lives in a set space - * where each dimension refers to an outer loop. - * context_value describes assignments to the parameters (if any) - * outside of the scop. - * - * "schedule" is the schedule of the statements in the scop. - * - * The n_type types define types that may be referenced from by the arrays. - * - * The n_implication implications describe implications on boolean filters. 
- * - * The n_independence independences describe independences implied - * by for loops that are marked independent in the source code. - */ -struct pet_scop { - pet_loc *loc; - - isl_set *context; - isl_set *context_value; - isl_schedule *schedule; - - int n_type; - struct pet_type **types; - - int n_array; - struct pet_array **arrays; - - int n_stmt; - struct pet_stmt **stmts; - - int n_implication; - struct pet_implication **implications; - - int n_independence; - struct pet_independence **independences; -}; -typedef struct pet_scop pet_scop; - -/* Return a textual representation of the operator. */ -const char *pet_op_str(enum pet_op_type op); -int pet_op_is_inc_dec(enum pet_op_type op); - -/* Extract a pet_scop from a C source file. - * If function is not NULL, then the pet_scop is extracted from - * a function with that name. - */ -__isl_give pet_scop *pet_scop_extract_from_C_source(isl_ctx *ctx, - const char *filename, const char *function); - -/* Transform the C source file "input" by rewriting each scop - * When autodetecting scops, at most one scop per function is rewritten. - * The transformed C code is written to "output". - */ -int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output, - __isl_give isl_printer *(*transform)(__isl_take isl_printer *p, - __isl_take pet_scop *scop, void *user), void *user); -/* Given a scop and a printer passed to a pet_transform_C_source callback, - * print the original corresponding code to the printer. - */ -__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop, - __isl_take isl_printer *p); - -/* Update all isl_sets and isl_maps such that they all have the same - * parameters in the same order. - */ -__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop); - -/* Does "scop" contain any data dependent accesses? */ -int pet_scop_has_data_dependent_accesses(__isl_keep pet_scop *scop); -/* Does "scop" contain any data dependent conditions? 
*/ -int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop); -/* pet_stmt_build_ast_exprs is currently limited to only handle - * some forms of data dependent accesses. - * If pet_scop_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs - * can safely be called on all statements in the scop. - */ -int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop); - -void pet_scop_dump(__isl_keep pet_scop *scop); -__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop); - -/* Return the context of "scop". */ -__isl_give isl_set *pet_scop_get_context(__isl_keep pet_scop *scop); -/* Return the schedule of "scop". */ -__isl_give isl_schedule *pet_scop_get_schedule(__isl_keep pet_scop *scop); -/* Return the set of all statement instances. */ -__isl_give isl_union_set *pet_scop_get_instance_set(__isl_keep pet_scop *scop); -/* Return the potential read access relation. */ -__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop); -/* Return the tagged potential read access relation. */ -__isl_give isl_union_map *pet_scop_get_tagged_may_reads( - __isl_keep pet_scop *scop); -/* Return the potential write access relation. */ -__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop); -/* Return the definite write access relation. */ -__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop); -/* Return the tagged potential write access relation. */ -__isl_give isl_union_map *pet_scop_get_tagged_may_writes( - __isl_keep pet_scop *scop); -/* Return the tagged definite write access relation. */ -__isl_give isl_union_map *pet_scop_get_tagged_must_writes( - __isl_keep pet_scop *scop); -/* Return the definite kill access relation. */ -__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop); -/* Return the tagged definite kill access relation. 
*/ -__isl_give isl_union_map *pet_scop_get_tagged_must_kills( - __isl_keep pet_scop *scop); - -/* Compute a mapping from all outermost arrays (of structs) in scop - * to their innermost members. - */ -__isl_give isl_union_map *pet_scop_compute_outer_to_inner( - __isl_keep pet_scop *scop); -/* Compute a mapping from all outermost arrays (of structs) in scop - * to their members, including the outermost arrays themselves. - */ -__isl_give isl_union_map *pet_scop_compute_outer_to_any( - __isl_keep pet_scop *scop); - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/polly/lib/External/ppcg/ChangeLog b/polly/lib/External/ppcg/ChangeLog deleted file mode 100644 --- a/polly/lib/External/ppcg/ChangeLog +++ /dev/null @@ -1,29 +0,0 @@ -version: 0.07 -date: Tue Feb 7 17:23:22 CET 2017 -changes: - - support hybrid tiling ---- -version: 0.06 -date: Fri May 6 12:08:50 CEST 2016 -changes: - - use PPCG specific macro names in generated code - - complete transition to schedule trees - - maximize coincidence by default - - map arrays with constant index expressions to private memory - - optionally group chains of statements ---- -version: 0.05 -date: Fri Jan 15 09:30:23 CET 2016 -changes: - - fix live-out computation - - optionally compute schedule for C target - - optionally perform tiling for C target - - create single kernel for non-permutable subtree ---- -version: 0.04 -date: Wed Jun 17 10:52:58 CEST 2015 -changes: - - use schedule trees - - fix live-range reordering - - improve generation of synchronization - - exploit independences during dependence analysis diff --git a/polly/lib/External/ppcg/GIT_HEAD_ID b/polly/lib/External/ppcg/GIT_HEAD_ID deleted file mode 100644 --- a/polly/lib/External/ppcg/GIT_HEAD_ID +++ /dev/null @@ -1 +0,0 @@ -ppcg-0.07 diff --git a/polly/lib/External/ppcg/README b/polly/lib/External/ppcg/README deleted file mode 100644 --- a/polly/lib/External/ppcg/README +++ /dev/null @@ -1,246 +0,0 @@ -Requirements: - -- automake, autoconf, libtool - 
(not needed when compiling a release) -- pkg-config (http://www.freedesktop.org/wiki/Software/pkg-config) - (not needed when compiling a release using the included isl and pet) -- gmp (http://gmplib.org/) -- libyaml (http://pyyaml.org/wiki/LibYAML) - (only needed if you want to compile the pet executable) -- LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html) - Unless you have some other reasons for wanting to use the svn version, - it is best to install the latest release (3.9). - For more details, see pet/README. - -If you are installing on Ubuntu, then you can install the following packages: - -automake autoconf libtool pkg-config libgmp3-dev libyaml-dev libclang-dev llvm - -Note that you need at least version 3.2 of libclang-dev (ubuntu raring). -Older versions of this package did not include the required libraries. -If you are using an older version of ubuntu, then you need to compile and -install LLVM/clang from source. - - -Preparing: - -Grab the latest release and extract it or get the source from -the git repository as follows. This process requires autoconf, -automake, libtool and pkg-config. - - git clone git://repo.or.cz/ppcg.git - cd ppcg - ./get_submodules.sh - ./autogen.sh - - -Compilation: - - ./configure - make - make check - -If you have installed any of the required libraries in a non-standard -location, then you may need to use the --with-gmp-prefix, ---with-libyaml-prefix and/or --with-clang-prefix options -when calling "./configure". - - -Using PPCG to generate CUDA or OpenCL code - -To convert a fragment of a C program to CUDA, insert a line containing - - #pragma scop - -before the fragment and add a line containing - - #pragma endscop - -after the fragment. To generate CUDA code run - - ppcg --target=cuda file.c - -where file.c is the file containing the fragment. The generated -code is stored in file_host.cu and file_kernel.cu. 
- -To generate OpenCL code run - - ppcg --target=opencl file.c - -where file.c is the file containing the fragment. The generated code -is stored in file_host.c and file_kernel.cl. - - -Specifying tile, grid and block sizes - -The iterations space tile size, grid size and block size can -be specified using the --sizes option. The argument is a union map -in isl notation mapping kernels identified by their sequence number -in a "kernel" space to singleton sets in the "tile", "grid" and "block" -spaces. The sizes are specified outermost to innermost. - -The dimension of the "tile" space indicates the (maximal) number of loop -dimensions to tile. The elements of the single integer tuple -specify the tile sizes in each dimension. -In case of hybrid tiling, the first element is half the size of -the tile in the time (sequential) dimension. The second element -specifies the number of elements in the base of the hexagon. -The remaining elements specify the tile sizes in the remaining space -dimensions. - -The dimension of the "grid" space indicates the (maximal) number of block -dimensions in the grid. The elements of the single integer tuple -specify the number of blocks in each dimension. - -The dimension of the "block" space indicates the (maximal) number of thread -dimensions in the grid. The elements of the single integer tuple -specify the number of threads in each dimension. - -For example, - - { kernel[0] -> tile[64,64]; kernel[i] -> block[16] : i != 4 } - -specifies that in kernel 0, two loops should be tiled with a tile -size of 64 in both dimensions and that all kernels except kernel 4 -should be run using a block of 16 threads. - -Since PPCG performs some scheduling, it can be difficult to predict -what exactly will end up in a kernel. If you want to specify -tile, grid or block sizes, you may want to run PPCG first with the defaults, -examine the kernels and then run PPCG again with the desired sizes. 
-Instead of examining the kernels, you can also specify the option ---dump-sizes on the first run to obtain the effectively used default sizes. - - -Compiling the generated CUDA code with nvcc - -To get optimal performance from nvcc, it is important to choose --arch -according to your target GPU. Specifically, use the flag "--arch sm_20" -for fermi, "--arch sm_30" for GK10x Kepler and "--arch sm_35" for -GK110 Kepler. We discourage the use of older cards as we have seen -correctness issues with compilation for older architectures. -Note that in the absence of any --arch flag, nvcc defaults to -"--arch sm_13". This will not only be slower, but can also cause -correctness issues. -If you want to obtain results that are identical to those obtained -by the original code, then you may need to disable some optimizations -by passing the "--fmad=false" option. - - -Compiling the generated OpenCL code with gcc - -To compile the host code you need to link against the file -ocl_utilities.c which contains utility functions used by the generated -OpenCL host code. To compile the host code with gcc, run - - gcc -std=c99 file_host.c ocl_utilities.c -lOpenCL - -Note that we have experienced the generated OpenCL code freezing -on some inputs (e.g., the PolyBench symm benchmark) when using -at least some version of the Nvidia OpenCL library, while the -corresponding CUDA code runs fine. -We have experienced no such freezes when using AMD, ARM or Intel -OpenCL libraries. - -By default, the compiled executable will need the _kernel.cl file at -run time. Alternatively, the option --opencl-embed-kernel-code may be -given to place the kernel code in a string literal. The kernel code is -then compiled into the host binary, such that the _kernel.cl file is no -longer needed at run time. Any kernel include files, in particular -those supplied using --opencl-include-file, will still be required at -run time. 
- - -Function calls - -Function calls inside the analyzed fragment are reproduced -in the CUDA or OpenCL code, but for now it is left to the user -to make sure that the functions that are being called are -available from the generated kernels. - -In the case of OpenCL code, the --opencl-include-file option -may be used to specify one or more files to be #include'd -from the generated code. These files may then contain -the definitions of the functions being called from the -program fragment. If the pathnames of the included files -are relative to the current directory, then you may need -to additionally specify the --opencl-compiler-options=-I. -to make sure that the files can be found by the OpenCL compiler. -The included files may contain definitions of types used by the -generated kernels. By default, PPCG generates definitions for -types as needed, but these definitions may collide with those in -the included files, as PPCG does not consider the contents of the -included files. The --no-opencl-print-kernel-types will prevent -PPCG from generating type definitions. - - -GNU extensions - -By default, PPCG may print out macro definitions that involve -GNU extensions such as __typeof__ and statement expressions. -Some compilers may not support these extensions. -In particular, OpenCL 1.2 beignet 1.1.1 (git-6de6918) -has been reported not to support __typeof__. -The use of these extensions can be turned off with the ---no-allow-gnu-extensions option. - - -Processing PolyBench - -When processing a PolyBench/C 3.2 benchmark, you should always specify --DPOLYBENCH_USE_C99_PROTO on the ppcg command line. Otherwise, the source -files are inconsistent, having fixed size arrays but parametrically -bounded loops iterating over them. -However, you should not specify this define when compiling -the PPCG generated code using nvcc since CUDA does not support VLAs. 
- - -CUDA and function overloading - -While CUDA supports function overloading based on the arguments types, -no such function overloading exists in the input language C. Since PPCG -simply prints out the same function name as in the original code, this -may result in a different function being called based on the types -of the arguments. For example, if the original code contains a call -to the function sqrt() with a float argument, then the argument will -be promoted to a double and the sqrt() function will be called. -In the transformed (CUDA) code, however, overloading will cause the -function sqrtf() to be called. Until this issue has been resolved in PPCG, -we recommend that users either explicitly call the function sqrtf() or -explicitly cast the argument to double in the input code. - - -Contact - -For bug reports, feature requests and questions, -contact http://groups.google.com/group/isl-development - -Whenever you report a bug, please mention the exact version of PPCG -that you are using (output of "./ppcg --version"). If you are unable -to compile PPCG, then report the git version (output of "git describe") -or the version number included in the name of the tarball. - - -Citing PPCG - -If you use PPCG for your research, you are invited to cite -the following paper. - -@article{Verdoolaege2013PPCG, - author = {Verdoolaege, Sven and Juega, Juan Carlos and Cohen, Albert and - G\'{o}mez, Jos{\'e} Ignacio and Tenllado, Christian and - Catthoor, Francky}, - title = {Polyhedral parallel code generation for CUDA}, - journal = {ACM Trans. Archit. 
Code Optim.}, - issue_date = {January 2013}, - volume = {9}, - number = {4}, - month = jan, - year = {2013}, - issn = {1544-3566}, - pages = {54:1--54:23}, - doi = {10.1145/2400682.2400713}, - acmid = {2400713}, - publisher = {ACM}, - address = {New York, NY, USA}, -} diff --git a/polly/lib/External/ppcg/cpu.h b/polly/lib/External/ppcg/cpu.h deleted file mode 100644 --- a/polly/lib/External/ppcg/cpu.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _CPU_H -#define _CPU_H - -#include - -#include "ppcg.h" - -struct ppcg_options; - -__isl_give isl_printer *print_cpu(__isl_take isl_printer *p, - struct ppcg_scop *ps, struct ppcg_options *options); -int generate_cpu(isl_ctx *ctx, struct ppcg_options *options, - const char *input, const char *output); - -#endif diff --git a/polly/lib/External/ppcg/cpu.c b/polly/lib/External/ppcg/cpu.c deleted file mode 100644 --- a/polly/lib/External/ppcg/cpu.c +++ /dev/null @@ -1,802 +0,0 @@ -/* - * Copyright 2012 INRIA Paris-Rocquencourt - * Copyright 2012 Ecole Normale Superieure - * - * Use of this software is governed by the MIT license - * - * Written by Tobias Grosser, INRIA Paris-Rocquencourt, - * Domaine de Voluceau, Rocquenqourt, B.P. 105, - * 78153 Le Chesnay Cedex France - * and Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ppcg.h" -#include "ppcg_options.h" -#include "cpu.h" -#include "print.h" -#include "schedule.h" -#include "util.h" - -/* Representation of a statement inside a generated AST. - * - * "stmt" refers to the original statement. - * "ref2expr" maps the reference identifier of each access in - * the statement to an AST expression that should be printed - * at the place of the access. 
- */ -struct ppcg_stmt { - struct pet_stmt *stmt; - - isl_id_to_ast_expr *ref2expr; -}; - -static void ppcg_stmt_free(void *user) -{ - struct ppcg_stmt *stmt = user; - - if (!stmt) - return; - - isl_id_to_ast_expr_free(stmt->ref2expr); - - free(stmt); -} - -/* Derive the output file name from the input file name. - * 'input' is the entire path of the input file. The output - * is the file name plus the additional extension. - * - * We will basically replace everything after the last point - * with '.ppcg.c'. This means file.c becomes file.ppcg.c - */ -static FILE *get_output_file(const char *input, const char *output) -{ - char name[PATH_MAX]; - const char *ext; - const char ppcg_marker[] = ".ppcg"; - int len; - FILE *file; - - len = ppcg_extract_base_name(name, input); - - strcpy(name + len, ppcg_marker); - ext = strrchr(input, '.'); - strcpy(name + len + sizeof(ppcg_marker) - 1, ext ? ext : ".c"); - - if (!output) - output = name; - - file = fopen(output, "w"); - if (!file) { - fprintf(stderr, "Unable to open '%s' for writing\n", output); - return NULL; - } - - return file; -} - -/* Data used to annotate for nodes in the ast. - */ -struct ast_node_userinfo { - /* The for node is an openmp parallel for node. */ - int is_openmp; -}; - -/* Information used while building the ast. - */ -struct ast_build_userinfo { - /* The current ppcg scop. */ - struct ppcg_scop *scop; - - /* Are we currently in a parallel for loop? */ - int in_parallel_for; -}; - -/* Check if the current scheduling dimension is parallel. - * - * We check for parallelism by verifying that the loop does not carry any - * dependences. - * If the live_range_reordering option is set, then this currently - * includes the order dependences. In principle, non-zero order dependences - * could be allowed, but this would require privatization and/or expansion. - * - * Parallelism test: if the distance is zero in all outer dimensions, then it - * has to be zero in the current dimension as well. 
- * Implementation: first, translate dependences into time space, then force - * outer dimensions to be equal. If the distance is zero in the current - * dimension, then the loop is parallel. - * The distance is zero in the current dimension if it is a subset of a map - * with equal values for the current dimension. - */ -static int ast_schedule_dim_is_parallel(__isl_keep isl_ast_build *build, - struct ppcg_scop *scop) -{ - isl_union_map *schedule, *deps; - isl_map *schedule_deps, *test; - isl_space *schedule_space; - unsigned i, dimension, is_parallel; - - schedule = isl_ast_build_get_schedule(build); - schedule_space = isl_ast_build_get_schedule_space(build); - - dimension = isl_space_dim(schedule_space, isl_dim_out) - 1; - - deps = isl_union_map_copy(scop->dep_flow); - deps = isl_union_map_union(deps, isl_union_map_copy(scop->dep_false)); - if (scop->options->live_range_reordering) { - isl_union_map *order = isl_union_map_copy(scop->dep_order); - deps = isl_union_map_union(deps, order); - } - deps = isl_union_map_apply_range(deps, isl_union_map_copy(schedule)); - deps = isl_union_map_apply_domain(deps, schedule); - - if (isl_union_map_is_empty(deps)) { - isl_union_map_free(deps); - isl_space_free(schedule_space); - return 1; - } - - schedule_deps = isl_map_from_union_map(deps); - - for (i = 0; i < dimension; i++) - schedule_deps = isl_map_equate(schedule_deps, isl_dim_out, i, - isl_dim_in, i); - - test = isl_map_universe(isl_map_get_space(schedule_deps)); - test = isl_map_equate(test, isl_dim_out, dimension, isl_dim_in, - dimension); - is_parallel = isl_map_is_subset(schedule_deps, test); - - isl_space_free(schedule_space); - isl_map_free(test); - isl_map_free(schedule_deps); - - return is_parallel; -} - -/* Mark a for node openmp parallel, if it is the outermost parallel for node. 
- */ -static void mark_openmp_parallel(__isl_keep isl_ast_build *build, - struct ast_build_userinfo *build_info, - struct ast_node_userinfo *node_info) -{ - if (build_info->in_parallel_for) - return; - - if (ast_schedule_dim_is_parallel(build, build_info->scop)) { - build_info->in_parallel_for = 1; - node_info->is_openmp = 1; - } -} - -/* Allocate an ast_node_info structure and initialize it with default values. - */ -static struct ast_node_userinfo *allocate_ast_node_userinfo() -{ - struct ast_node_userinfo *node_info; - node_info = (struct ast_node_userinfo *) - malloc(sizeof(struct ast_node_userinfo)); - node_info->is_openmp = 0; - return node_info; -} - -/* Free an ast_node_info structure. - */ -static void free_ast_node_userinfo(void *ptr) -{ - struct ast_node_userinfo *info; - info = (struct ast_node_userinfo *) ptr; - free(info); -} - -/* This method is executed before the construction of a for node. It creates - * an isl_id that is used to annotate the subsequently generated ast for nodes. - * - * In this function we also run the following analyses: - * - * - Detection of openmp parallel loops - */ -static __isl_give isl_id *ast_build_before_for( - __isl_keep isl_ast_build *build, void *user) -{ - isl_id *id; - struct ast_build_userinfo *build_info; - struct ast_node_userinfo *node_info; - - build_info = (struct ast_build_userinfo *) user; - node_info = allocate_ast_node_userinfo(); - id = isl_id_alloc(isl_ast_build_get_ctx(build), "", node_info); - id = isl_id_set_free_user(id, free_ast_node_userinfo); - - mark_openmp_parallel(build, build_info, node_info); - - return id; -} - -/* This method is executed after the construction of a for node. - * - * It performs the following actions: - * - * - Reset the 'in_parallel_for' flag, as soon as we leave a for node, - * that is marked as openmp parallel. 
- * - */ -static __isl_give isl_ast_node *ast_build_after_for( - __isl_take isl_ast_node *node, __isl_keep isl_ast_build *build, - void *user) -{ - isl_id *id; - struct ast_build_userinfo *build_info; - struct ast_node_userinfo *info; - - id = isl_ast_node_get_annotation(node); - info = isl_id_get_user(id); - - if (info && info->is_openmp) { - build_info = (struct ast_build_userinfo *) user; - build_info->in_parallel_for = 0; - } - - isl_id_free(id); - - return node; -} - -/* Find the element in scop->stmts that has the given "id". - */ -static struct pet_stmt *find_stmt(struct ppcg_scop *scop, __isl_keep isl_id *id) -{ - int i; - - for (i = 0; i < scop->pet->n_stmt; ++i) { - struct pet_stmt *stmt = scop->pet->stmts[i]; - isl_id *id_i; - - id_i = isl_set_get_tuple_id(stmt->domain); - isl_id_free(id_i); - - if (id_i == id) - return stmt; - } - - isl_die(isl_id_get_ctx(id), isl_error_internal, - "statement not found", return NULL); -} - -/* Print a user statement in the generated AST. - * The ppcg_stmt has been attached to the node in at_each_domain. - */ -static __isl_give isl_printer *print_user(__isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options, - __isl_keep isl_ast_node *node, void *user) -{ - struct ppcg_stmt *stmt; - isl_id *id; - - id = isl_ast_node_get_annotation(node); - stmt = isl_id_get_user(id); - isl_id_free(id); - - p = pet_stmt_print_body(stmt->stmt, p, stmt->ref2expr); - - isl_ast_print_options_free(print_options); - - return p; -} - - -/* Print a for loop node as an openmp parallel loop. - * - * To print an openmp parallel loop we print a normal for loop, but add - * "#pragma openmp parallel for" in front. - * - * Variables that are declared within the body of this for loop are - * automatically openmp 'private'. Iterators declared outside of the - * for loop are automatically openmp 'shared'. As ppcg declares all iterators - * at the position where they are assigned, there is no need to explicitly mark - * variables. 
Their automatically assigned type is already correct. - * - * This function only generates valid OpenMP code, if the ast was generated - * with the 'atomic-bounds' option enabled. - * - */ -static __isl_give isl_printer *print_for_with_openmp( - __isl_keep isl_ast_node *node, __isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "#pragma omp parallel for"); - p = isl_printer_end_line(p); - - p = isl_ast_node_for_print(node, p, print_options); - - return p; -} - -/* Print a for node. - * - * Depending on how the node is annotated, we either print a normal - * for node or an openmp parallel for node. - */ -static __isl_give isl_printer *print_for(__isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options, - __isl_keep isl_ast_node *node, void *user) -{ - isl_id *id; - int openmp; - - openmp = 0; - id = isl_ast_node_get_annotation(node); - - if (id) { - struct ast_node_userinfo *info; - - info = (struct ast_node_userinfo *) isl_id_get_user(id); - if (info && info->is_openmp) - openmp = 1; - } - - if (openmp) - p = print_for_with_openmp(node, p, print_options); - else - p = isl_ast_node_for_print(node, p, print_options); - - isl_id_free(id); - - return p; -} - -/* Index transformation callback for pet_stmt_build_ast_exprs. - * - * "index" expresses the array indices in terms of statement iterators - * "iterator_map" expresses the statement iterators in terms of - * AST loop iterators. - * - * The result expresses the array indices in terms of - * AST loop iterators. 
- */ -static __isl_give isl_multi_pw_aff *pullback_index( - __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *id, void *user) -{ - isl_pw_multi_aff *iterator_map = user; - - iterator_map = isl_pw_multi_aff_copy(iterator_map); - return isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map); -} - -/* Transform the accesses in the statement associated to the domain - * called by "node" to refer to the AST loop iterators, construct - * corresponding AST expressions using "build", - * collect them in a ppcg_stmt and annotate the node with the ppcg_stmt. - */ -static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build, void *user) -{ - struct ppcg_scop *scop = user; - isl_ast_expr *expr, *arg; - isl_ctx *ctx; - isl_id *id; - isl_map *map; - isl_pw_multi_aff *iterator_map; - struct ppcg_stmt *stmt; - - ctx = isl_ast_node_get_ctx(node); - stmt = isl_calloc_type(ctx, struct ppcg_stmt); - if (!stmt) - goto error; - - expr = isl_ast_node_user_get_expr(node); - arg = isl_ast_expr_get_op_arg(expr, 0); - isl_ast_expr_free(expr); - id = isl_ast_expr_get_id(arg); - isl_ast_expr_free(arg); - stmt->stmt = find_stmt(scop, id); - isl_id_free(id); - if (!stmt->stmt) - goto error; - - map = isl_map_from_union_map(isl_ast_build_get_schedule(build)); - map = isl_map_reverse(map); - iterator_map = isl_pw_multi_aff_from_map(map); - stmt->ref2expr = pet_stmt_build_ast_exprs(stmt->stmt, build, - &pullback_index, iterator_map, NULL, NULL); - isl_pw_multi_aff_free(iterator_map); - - id = isl_id_alloc(isl_ast_node_get_ctx(node), NULL, stmt); - id = isl_id_set_free_user(id, &ppcg_stmt_free); - return isl_ast_node_set_annotation(node, id); -error: - ppcg_stmt_free(stmt); - return isl_ast_node_free(node); -} - -/* Set *depth (initialized to 0 by the caller) to the maximum - * of the schedule depths of the leaf nodes for which this function is called. 
- */ -static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user) -{ - int *depth = user; - int node_depth; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf) - return isl_bool_true; - node_depth = isl_schedule_node_get_schedule_depth(node); - if (node_depth > *depth) - *depth = node_depth; - - return isl_bool_false; -} - -/* This function is called for each node in a CPU AST. - * In case of a user node, print the macro definitions required - * for printing the AST expressions in the annotation, if any. - * For other nodes, return true such that descendants are also - * visited. - * - * In particular, print the macro definitions needed for the substitutions - * of the original user statements. - */ -static isl_bool at_node(__isl_keep isl_ast_node *node, void *user) -{ - struct ppcg_stmt *stmt; - isl_id *id; - isl_printer **p = user; - - if (isl_ast_node_get_type(node) != isl_ast_node_user) - return isl_bool_true; - - id = isl_ast_node_get_annotation(node); - stmt = isl_id_get_user(id); - isl_id_free(id); - - if (!stmt) - return isl_bool_error; - - *p = ppcg_print_body_macros(*p, stmt->ref2expr); - if (!*p) - return isl_bool_error; - - return isl_bool_false; -} - -/* Print the required macros for the CPU AST "node" to "p", - * including those needed for the user statements inside the AST. - */ -static __isl_give isl_printer *cpu_print_macros(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node) -{ - if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0) - return isl_printer_free(p); - p = ppcg_print_macros(p, node); - return p; -} - -/* Code generate the scop 'scop' using "schedule" - * and print the corresponding C code to 'p'. 
- */ -static __isl_give isl_printer *print_scop(struct ppcg_scop *scop, - __isl_take isl_schedule *schedule, __isl_take isl_printer *p, - struct ppcg_options *options) -{ - isl_ctx *ctx = isl_printer_get_ctx(p); - isl_ast_build *build; - isl_ast_print_options *print_options; - isl_ast_node *tree; - isl_id_list *iterators; - struct ast_build_userinfo build_info; - int depth; - - depth = 0; - if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth, - &depth) < 0) - goto error; - - build = isl_ast_build_alloc(ctx); - iterators = ppcg_scop_generate_names(scop, depth, "c"); - build = isl_ast_build_set_iterators(build, iterators); - build = isl_ast_build_set_at_each_domain(build, &at_each_domain, scop); - - if (options->openmp) { - build_info.scop = scop; - build_info.in_parallel_for = 0; - - build = isl_ast_build_set_before_each_for(build, - &ast_build_before_for, - &build_info); - build = isl_ast_build_set_after_each_for(build, - &ast_build_after_for, - &build_info); - } - - tree = isl_ast_build_node_from_schedule(build, schedule); - isl_ast_build_free(build); - - print_options = isl_ast_print_options_alloc(ctx); - print_options = isl_ast_print_options_set_print_user(print_options, - &print_user, NULL); - - print_options = isl_ast_print_options_set_print_for(print_options, - &print_for, NULL); - - p = cpu_print_macros(p, tree); - p = isl_ast_node_print(tree, p, print_options); - - isl_ast_node_free(tree); - - return p; -error: - isl_schedule_free(schedule); - isl_printer_free(p); - return NULL; -} - -/* Tile the band node "node" with tile sizes "sizes" and - * mark all members of the resulting tile node as "atomic". - */ -static __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node, - __isl_take isl_multi_val *sizes) -{ - node = isl_schedule_node_band_tile(node, sizes); - node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); - - return node; -} - -/* Tile "node", if it is a band node with at least 2 members. 
- * The tile sizes are set from the "tile_size" option. - */ -static __isl_give isl_schedule_node *tile_band( - __isl_take isl_schedule_node *node, void *user) -{ - struct ppcg_scop *scop = user; - int n; - isl_space *space; - isl_multi_val *sizes; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_band) - return node; - - n = isl_schedule_node_band_n_member(node); - if (n <= 1) - return node; - - space = isl_schedule_node_band_get_space(node); - sizes = ppcg_multi_val_from_int(space, scop->options->tile_size); - - return tile(node, sizes); -} - -/* Construct schedule constraints from the dependences in ps - * for the purpose of computing a schedule for a CPU. - * - * The proximity constraints are set to the flow dependences. - * - * If live-range reordering is allowed then the conditional validity - * constraints are set to the order dependences with the flow dependences - * as condition. That is, a live-range (flow dependence) will be either - * local to an iteration of a band or all adjacent order dependences - * will be respected by the band. - * The validity constraints are set to the union of the flow dependences - * and the forced dependences, while the coincidence constraints - * are set to the union of the flow dependences, the forced dependences and - * the order dependences. - * - * If live-range reordering is not allowed, then both the validity - * and the coincidence constraints are set to the union of the flow - * dependences and the false dependences. - * - * Note that the coincidence constraints are only set when the "openmp" - * options is set. Even though the way openmp pragmas are introduced - * does not rely on the coincident property of the schedule band members, - * the coincidence constraints do affect the way the schedule is constructed, - * such that more schedule dimensions should be detected as parallel - * by ast_schedule_dim_is_parallel. 
- * Since the order dependences are also taken into account by - * ast_schedule_dim_is_parallel, they are also added to - * the coincidence constraints. If the openmp handling learns - * how to privatize some memory, then the corresponding order - * dependences can be removed from the coincidence constraints. - */ -static __isl_give isl_schedule_constraints *construct_cpu_schedule_constraints( - struct ppcg_scop *ps) -{ - isl_schedule_constraints *sc; - isl_union_map *validity, *coincidence; - - sc = isl_schedule_constraints_on_domain(isl_union_set_copy(ps->domain)); - if (ps->options->live_range_reordering) { - sc = isl_schedule_constraints_set_conditional_validity(sc, - isl_union_map_copy(ps->tagged_dep_flow), - isl_union_map_copy(ps->tagged_dep_order)); - validity = isl_union_map_copy(ps->dep_flow); - validity = isl_union_map_union(validity, - isl_union_map_copy(ps->dep_forced)); - if (ps->options->openmp) { - coincidence = isl_union_map_copy(validity); - coincidence = isl_union_map_union(coincidence, - isl_union_map_copy(ps->dep_order)); - } - } else { - validity = isl_union_map_copy(ps->dep_flow); - validity = isl_union_map_union(validity, - isl_union_map_copy(ps->dep_false)); - if (ps->options->openmp) - coincidence = isl_union_map_copy(validity); - } - if (ps->options->openmp) - sc = isl_schedule_constraints_set_coincidence(sc, coincidence); - sc = isl_schedule_constraints_set_validity(sc, validity); - sc = isl_schedule_constraints_set_proximity(sc, - isl_union_map_copy(ps->dep_flow)); - - return sc; -} - -/* Compute a schedule for the scop "ps". - * - * First derive the appropriate schedule constraints from the dependences - * in "ps" and then compute a schedule from those schedule constraints, - * possibly grouping statement instances based on the input schedule. 
- */ -static __isl_give isl_schedule *compute_cpu_schedule(struct ppcg_scop *ps) -{ - isl_schedule_constraints *sc; - isl_schedule *schedule; - - if (!ps) - return NULL; - - sc = construct_cpu_schedule_constraints(ps); - - if (ps->options->debug->dump_schedule_constraints) - isl_schedule_constraints_dump(sc); - schedule = ppcg_compute_schedule(sc, ps->schedule, ps->options); - - return schedule; -} - -/* Compute a new schedule to the scop "ps" if the reschedule option is set. - * Otherwise, return a copy of the original schedule. - */ -static __isl_give isl_schedule *optionally_compute_schedule(void *user) -{ - struct ppcg_scop *ps = user; - - if (!ps) - return NULL; - if (!ps->options->reschedule) - return isl_schedule_copy(ps->schedule); - return compute_cpu_schedule(ps); -} - -/* Compute a schedule based on the dependences in "ps" and - * tile it if requested by the user. - */ -static __isl_give isl_schedule *get_schedule(struct ppcg_scop *ps, - struct ppcg_options *options) -{ - isl_ctx *ctx; - isl_schedule *schedule; - - if (!ps) - return NULL; - - ctx = isl_union_set_get_ctx(ps->domain); - schedule = ppcg_get_schedule(ctx, options, - &optionally_compute_schedule, ps); - if (ps->options->tile) - schedule = isl_schedule_map_schedule_node_bottom_up(schedule, - &tile_band, ps); - - return schedule; -} - -/* Generate CPU code for the scop "ps" using "schedule" and - * print the corresponding C code to "p", including variable declarations. 
- */ -static __isl_give isl_printer *print_cpu_with_schedule( - __isl_take isl_printer *p, struct ppcg_scop *ps, - __isl_take isl_schedule *schedule, struct ppcg_options *options) -{ - int hidden; - isl_set *context; - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "/* ppcg generated CPU code */"); - p = isl_printer_end_line(p); - - p = isl_printer_start_line(p); - p = isl_printer_end_line(p); - - p = ppcg_set_macro_names(p); - p = ppcg_print_exposed_declarations(p, ps); - hidden = ppcg_scop_any_hidden_declarations(ps); - if (hidden) { - p = ppcg_start_block(p); - p = ppcg_print_hidden_declarations(p, ps); - } - - context = isl_set_copy(ps->context); - context = isl_set_from_params(context); - schedule = isl_schedule_insert_context(schedule, context); - if (options->debug->dump_final_schedule) - isl_schedule_dump(schedule); - p = print_scop(ps, schedule, p, options); - if (hidden) - p = ppcg_end_block(p); - - return p; -} - -/* Generate CPU code for the scop "ps" and print the corresponding C code - * to "p", including variable declarations. - */ -__isl_give isl_printer *print_cpu(__isl_take isl_printer *p, - struct ppcg_scop *ps, struct ppcg_options *options) -{ - isl_schedule *schedule; - - schedule = isl_schedule_copy(ps->schedule); - return print_cpu_with_schedule(p, ps, schedule, options); -} - -/* Generate CPU code for "scop" and print it to "p". - * - * First obtain a schedule for "scop" and then print code for "scop" - * using that schedule. - */ -static __isl_give isl_printer *generate(__isl_take isl_printer *p, - struct ppcg_scop *scop, struct ppcg_options *options) -{ - isl_schedule *schedule; - - schedule = get_schedule(scop, options); - - return print_cpu_with_schedule(p, scop, schedule, options); -} - -/* Wrapper around generate for use as a ppcg_transform callback. 
- */ -static __isl_give isl_printer *print_cpu_wrap(__isl_take isl_printer *p, - struct ppcg_scop *scop, void *user) -{ - struct ppcg_options *options = user; - - return generate(p, scop, options); -} - -/* Transform the code in the file called "input" by replacing - * all scops by corresponding CPU code and write the results to a file - * called "output". - */ -int generate_cpu(isl_ctx *ctx, struct ppcg_options *options, - const char *input, const char *output) -{ - FILE *output_file; - int r; - - output_file = get_output_file(input, output); - if (!output_file) - return -1; - - r = ppcg_transform(ctx, input, output_file, options, - &print_cpu_wrap, options); - - fclose(output_file); - - return r; -} diff --git a/polly/lib/External/ppcg/cuda.h b/polly/lib/External/ppcg/cuda.h deleted file mode 100644 --- a/polly/lib/External/ppcg/cuda.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _CUDA_H -#define _CUDA_H - -#include "ppcg_options.h" -#include "ppcg.h" - -int generate_cuda(isl_ctx *ctx, struct ppcg_options *options, - const char *input); - -__isl_give isl_printer *print_host_user(__isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options, - __isl_keep isl_ast_node *node, void *user); -#endif diff --git a/polly/lib/External/ppcg/cuda.c b/polly/lib/External/ppcg/cuda.c deleted file mode 100644 --- a/polly/lib/External/ppcg/cuda.c +++ /dev/null @@ -1,730 +0,0 @@ -/* - * Copyright 2012 Ecole Normale Superieure - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France - */ - -#include -#include - -#include "cuda_common.h" -#include "cuda.h" -#include "gpu.h" -#include "gpu_print.h" -#include "print.h" -#include "util.h" - -static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p) -{ - const char *macros = - "#define cudaCheckReturn(ret) \\\n" - " do { \\\n" - " cudaError_t cudaCheckReturn_e = (ret); \\\n" - " if (cudaCheckReturn_e != 
cudaSuccess) { \\\n" - " fprintf(stderr, \"CUDA error: %s\\n\", " - "cudaGetErrorString(cudaCheckReturn_e)); \\\n" - " fflush(stderr); \\\n" - " } \\\n" - " assert(cudaCheckReturn_e == cudaSuccess); \\\n" - " } while(0)\n" - "#define cudaCheckKernel() \\\n" - " do { \\\n" - " cudaCheckReturn(cudaGetLastError()); \\\n" - " } while(0)\n\n"; - - p = isl_printer_print_str(p, macros); - return p; -} - -/* Print a declaration for the device array corresponding to "array" on "p". - */ -static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p, - struct gpu_array_info *array) -{ - int i; - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, array->type); - p = isl_printer_print_str(p, " "); - if (!array->linearize && array->n_index > 1) - p = isl_printer_print_str(p, "("); - p = isl_printer_print_str(p, "*dev_"); - p = isl_printer_print_str(p, array->name); - if (!array->linearize && array->n_index > 1) { - p = isl_printer_print_str(p, ")"); - for (i = 1; i < array->n_index; i++) { - isl_ast_expr *bound; - bound = isl_ast_expr_get_op_arg(array->bound_expr, - 1 + i); - p = isl_printer_print_str(p, "["); - p = isl_printer_print_ast_expr(p, bound); - p = isl_printer_print_str(p, "]"); - isl_ast_expr_free(bound); - } - } - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - if (!gpu_array_requires_device_allocation(&prog->array[i])) - continue; - - p = declare_device_array(p, &prog->array[i]); - } - p = isl_printer_start_line(p); - p = isl_printer_end_line(p); - return p; -} - -static __isl_give isl_printer *allocate_device_arrays( - __isl_take isl_printer *p, struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - - if (!gpu_array_requires_device_allocation(&prog->array[i])) - 
continue; - p = ppcg_ast_expr_print_macros(array->bound_expr, p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, - "cudaCheckReturn(cudaMalloc((void **) &dev_"); - p = isl_printer_print_str(p, prog->array[i].name); - p = isl_printer_print_str(p, ", "); - p = gpu_array_info_print_size(p, &prog->array[i]); - p = isl_printer_print_str(p, "));"); - p = isl_printer_end_line(p); - } - p = isl_printer_start_line(p); - p = isl_printer_end_line(p); - return p; -} - -static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - if (!gpu_array_requires_device_allocation(&prog->array[i])) - continue; - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_"); - p = isl_printer_print_str(p, prog->array[i].name); - p = isl_printer_print_str(p, "));"); - p = isl_printer_end_line(p); - } - - return p; -} - -/* Print code to "p" for copying "array" from the host to the device - * in its entirety. The bounds on the extent of "array" have - * been precomputed in extract_array_info and are used in - * gpu_array_info_print_size. - */ -static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p, - struct gpu_array_info *array) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_"); - p = isl_printer_print_str(p, array->name); - p = isl_printer_print_str(p, ", "); - - if (gpu_array_is_scalar(array)) - p = isl_printer_print_str(p, "&"); - p = isl_printer_print_str(p, array->name); - p = isl_printer_print_str(p, ", "); - - p = gpu_array_info_print_size(p, array); - p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));"); - p = isl_printer_end_line(p); - - return p; -} - -/* Print code to "p" for copying "array" back from the device to the host - * in its entirety. 
The bounds on the extent of "array" have - * been precomputed in extract_array_info and are used in - * gpu_array_info_print_size. - */ -static __isl_give isl_printer *copy_array_from_device( - __isl_take isl_printer *p, struct gpu_array_info *array) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy("); - if (gpu_array_is_scalar(array)) - p = isl_printer_print_str(p, "&"); - p = isl_printer_print_str(p, array->name); - p = isl_printer_print_str(p, ", dev_"); - p = isl_printer_print_str(p, array->name); - p = isl_printer_print_str(p, ", "); - p = gpu_array_info_print_size(p, array); - p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));"); - p = isl_printer_end_line(p); - - return p; -} - -static __isl_give isl_printer* print_reverse_list(__isl_take isl_printer *p, int len, int *list) -{ - int i; - - if (len == 0) - return p; - - p = isl_printer_print_str(p, "("); - for (i = 0; i < len; ++i) { - if (i) - p = isl_printer_print_str(p, ", "); - p = isl_printer_print_int(p, list[len - 1 - i]); - } - return isl_printer_print_str(p, ")"); -} - -/* Print the effective grid size as a list of the sizes in each - * dimension, from innermost to outermost. - */ -static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p, - struct ppcg_kernel *kernel) -{ - int i; - int dim; - - dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set); - if (dim == 0) - return p; - - p = isl_printer_print_str(p, "("); - for (i = dim - 1; i >= 0; --i) { - isl_ast_expr *bound; - - bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i); - p = isl_printer_print_ast_expr(p, bound); - isl_ast_expr_free(bound); - - if (i > 0) - p = isl_printer_print_str(p, ", "); - } - - p = isl_printer_print_str(p, ")"); - - return p; -} - -/* Print the grid definition. 
- */ -static __isl_give isl_printer *print_grid(__isl_take isl_printer *p, - struct ppcg_kernel *kernel) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "dim3 k"); - p = isl_printer_print_int(p, kernel->id); - p = isl_printer_print_str(p, "_dimGrid"); - p = print_grid_size(p, kernel); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -/* Print the arguments to a kernel declaration or call. If "types" is set, - * then print a declaration (including the types of the arguments). - * - * The arguments are printed in the following order - * - the arrays accessed by the kernel - * - the parameters - * - the host loop iterators - */ -static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p, - struct gpu_prog *prog, struct ppcg_kernel *kernel, int types) -{ - int i, n; - int first = 1; - unsigned nparam; - isl_space *space; - const char *type; - - for (i = 0; i < prog->n_array; ++i) { - int required; - - required = ppcg_kernel_requires_array_argument(kernel, i); - if (required < 0) - return isl_printer_free(p); - if (!required) - continue; - - if (!first) - p = isl_printer_print_str(p, ", "); - - if (types) - p = gpu_array_info_print_declaration_argument(p, - &prog->array[i], NULL); - else - p = gpu_array_info_print_call_argument(p, - &prog->array[i]); - - first = 0; - } - - space = isl_union_set_get_space(kernel->arrays); - nparam = isl_space_dim(space, isl_dim_param); - for (i = 0; i < nparam; ++i) { - const char *name; - - name = isl_space_get_dim_name(space, isl_dim_param, i); - - if (!first) - p = isl_printer_print_str(p, ", "); - if (types) - p = isl_printer_print_str(p, "int "); - p = isl_printer_print_str(p, name); - - first = 0; - } - isl_space_free(space); - - n = isl_space_dim(kernel->space, isl_dim_set); - type = isl_options_get_ast_iterator_type(prog->ctx); - for (i = 0; i < n; ++i) { - const char *name; - - if (!first) - p = isl_printer_print_str(p, ", "); - name = 
isl_space_get_dim_name(kernel->space, isl_dim_set, i); - if (types) { - p = isl_printer_print_str(p, type); - p = isl_printer_print_str(p, " "); - } - p = isl_printer_print_str(p, name); - - first = 0; - } - - return p; -} - -/* Print the header of the given kernel. - */ -static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p, - struct gpu_prog *prog, struct ppcg_kernel *kernel) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "__global__ void kernel"); - p = isl_printer_print_int(p, kernel->id); - p = isl_printer_print_str(p, "("); - p = print_kernel_arguments(p, prog, kernel, 1); - p = isl_printer_print_str(p, ")"); - - return p; -} - -/* Print the header of the given kernel to both gen->cuda.kernel_h - * and gen->cuda.kernel_c. - */ -static void print_kernel_headers(struct gpu_prog *prog, - struct ppcg_kernel *kernel, struct cuda_info *cuda) -{ - isl_printer *p; - - p = isl_printer_to_file(prog->ctx, cuda->kernel_h); - p = isl_printer_set_output_format(p, ISL_FORMAT_C); - p = print_kernel_header(p, prog, kernel); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - isl_printer_free(p); - - p = isl_printer_to_file(prog->ctx, cuda->kernel_c); - p = isl_printer_set_output_format(p, ISL_FORMAT_C); - p = print_kernel_header(p, prog, kernel); - p = isl_printer_end_line(p); - isl_printer_free(p); -} - -static void print_indent(FILE *dst, int indent) -{ - fprintf(dst, "%*s", indent, ""); -} - -/* Print a list of iterators of type "type" with names "ids" to "out". - * Each iterator is assigned one of the cuda identifiers in cuda_dims. - * In particular, the last iterator is assigned the x identifier - * (the first in the list of cuda identifiers). 
- */ -static void print_iterators(FILE *out, const char *type, - __isl_keep isl_id_list *ids, const char *cuda_dims[]) -{ - int i, n; - - n = isl_id_list_n_id(ids); - if (n <= 0) - return; - print_indent(out, 4); - fprintf(out, "%s ", type); - for (i = 0; i < n; ++i) { - isl_id *id; - - if (i) - fprintf(out, ", "); - id = isl_id_list_get_id(ids, i); - fprintf(out, "%s = %s", isl_id_get_name(id), - cuda_dims[n - 1 - i]); - isl_id_free(id); - } - fprintf(out, ";\n"); -} - -static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel) -{ - isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree); - const char *type; - const char *block_dims[] = { "blockIdx.x", "blockIdx.y" }; - const char *thread_dims[] = { "threadIdx.x", "threadIdx.y", - "threadIdx.z" }; - - type = isl_options_get_ast_iterator_type(ctx); - - print_iterators(out, type, kernel->block_ids, block_dims); - print_iterators(out, type, kernel->thread_ids, thread_dims); -} - -static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p, - struct ppcg_kernel_var *var) -{ - int j; - - p = isl_printer_start_line(p); - if (var->type == ppcg_access_shared) - p = isl_printer_print_str(p, "__shared__ "); - p = isl_printer_print_str(p, var->array->type); - p = isl_printer_print_str(p, " "); - p = isl_printer_print_str(p, var->name); - for (j = 0; j < var->array->n_index; ++j) { - isl_val *v; - - p = isl_printer_print_str(p, "["); - v = isl_vec_get_element_val(var->size, j); - p = isl_printer_print_val(p, v); - isl_val_free(v); - p = isl_printer_print_str(p, "]"); - } - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p, - struct ppcg_kernel *kernel) -{ - int i; - - for (i = 0; i < kernel->n_var; ++i) - p = print_kernel_var(p, &kernel->var[i]); - - return p; -} - -/* Print a sync statement. 
- */ -static __isl_give isl_printer *print_sync(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "__syncthreads();"); - p = isl_printer_end_line(p); - - return p; -} - -/* This function is called for each user statement in the AST, - * i.e., for each kernel body statement, copy statement or sync statement. - */ -static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options, - __isl_keep isl_ast_node *node, void *user) -{ - isl_id *id; - struct ppcg_kernel_stmt *stmt; - - id = isl_ast_node_get_annotation(node); - stmt = isl_id_get_user(id); - isl_id_free(id); - - isl_ast_print_options_free(print_options); - - switch (stmt->type) { - case ppcg_kernel_copy: - return ppcg_kernel_print_copy(p, stmt); - case ppcg_kernel_sync: - return print_sync(p, stmt); - case ppcg_kernel_domain: - return ppcg_kernel_print_domain(p, stmt); - } - - return p; -} - -static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel, - struct cuda_info *cuda) -{ - isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree); - isl_ast_print_options *print_options; - isl_printer *p; - - print_kernel_headers(prog, kernel, cuda); - fprintf(cuda->kernel_c, "{\n"); - print_kernel_iterators(cuda->kernel_c, kernel); - - p = isl_printer_to_file(ctx, cuda->kernel_c); - p = isl_printer_set_output_format(p, ISL_FORMAT_C); - p = isl_printer_indent(p, 4); - - p = print_kernel_vars(p, kernel); - p = isl_printer_end_line(p); - p = ppcg_set_macro_names(p); - p = gpu_print_macros(p, kernel->tree); - - print_options = isl_ast_print_options_alloc(ctx); - print_options = isl_ast_print_options_set_print_user(print_options, - &print_kernel_stmt, NULL); - p = isl_ast_node_print(kernel->tree, p, print_options); - isl_printer_free(p); - - fprintf(cuda->kernel_c, "}\n"); -} - -/* Print code for initializing the device for execution of the transformed - * code. 
This includes declaring locally defined variables as well as - * declaring and allocating the required copies of arrays on the device. - */ -static __isl_give isl_printer *init_device(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - p = print_cuda_macros(p); - - p = gpu_print_local_declarations(p, prog); - p = declare_device_arrays(p, prog); - p = allocate_device_arrays(p, prog); - - return p; -} - -/* Print code for clearing the device after execution of the transformed code. - * In particular, free the memory that was allocated on the device. - */ -static __isl_give isl_printer *clear_device(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - p = free_device_arrays(p, prog); - - return p; -} - -/* Print a statement for copying an array to or from the device, - * or for initializing or clearing the device. - * The statement identifier of a copying node is called - * "to_device_" or "from_device_" and - * its user pointer points to the gpu_array_info of the array - * that needs to be copied. - * The node for initializing the device is called "init_device". - * The node for clearing the device is called "clear_device". - * - * Extract the array (if any) from the identifier and call - * init_device, clear_device, copy_array_to_device or copy_array_from_device. 
- */ -static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node, struct gpu_prog *prog) -{ - isl_ast_expr *expr, *arg; - isl_id *id; - const char *name; - struct gpu_array_info *array; - - expr = isl_ast_node_user_get_expr(node); - arg = isl_ast_expr_get_op_arg(expr, 0); - id = isl_ast_expr_get_id(arg); - name = isl_id_get_name(id); - array = isl_id_get_user(id); - isl_id_free(id); - isl_ast_expr_free(arg); - isl_ast_expr_free(expr); - - if (!name) - return isl_printer_free(p); - if (!strcmp(name, "init_device")) - return init_device(p, prog); - if (!strcmp(name, "clear_device")) - return clear_device(p, prog); - if (!array) - return isl_printer_free(p); - - if (!prefixcmp(name, "to_device")) - return copy_array_to_device(p, array); - else - return copy_array_from_device(p, array); -} - -struct print_host_user_data { - struct cuda_info *cuda; - struct gpu_prog *prog; -}; - -/* Print the user statement of the host code to "p". - * - * The host code may contain original user statements, kernel launches, - * statements that copy data to/from the device and statements - * the initialize or clear the device. - * The original user statements and the kernel launches have - * an associated annotation, while the other statements do not. - * The latter are handled by print_device_node. - * The annotation on the user statements is called "user". - * - * In case of a kernel launch, print a block of statements that - * defines the grid and the block and then launches the kernel. 
- */ -__isl_give isl_printer *print_host_user(__isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options, - __isl_keep isl_ast_node *node, void *user) -{ - isl_id *id; - int is_user; - struct ppcg_kernel *kernel; - struct ppcg_kernel_stmt *stmt; - struct print_host_user_data *data; - - isl_ast_print_options_free(print_options); - - data = (struct print_host_user_data *) user; - - id = isl_ast_node_get_annotation(node); - if (!id) - return print_device_node(p, node, data->prog); - - is_user = !strcmp(isl_id_get_name(id), "user"); - kernel = is_user ? NULL : isl_id_get_user(id); - stmt = is_user ? isl_id_get_user(id) : NULL; - isl_id_free(id); - - if (is_user) - return ppcg_kernel_print_domain(p, stmt); - - p = ppcg_start_block(p); - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "dim3 k"); - p = isl_printer_print_int(p, kernel->id); - p = isl_printer_print_str(p, "_dimBlock"); - p = print_reverse_list(p, kernel->n_block, kernel->block_dim); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - p = print_grid(p, kernel); - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "kernel"); - p = isl_printer_print_int(p, kernel->id); - p = isl_printer_print_str(p, " <<id); - p = isl_printer_print_str(p, "_dimGrid, k"); - p = isl_printer_print_int(p, kernel->id); - p = isl_printer_print_str(p, "_dimBlock>>> ("); - p = print_kernel_arguments(p, data->prog, kernel, 0); - p = isl_printer_print_str(p, ");"); - p = isl_printer_end_line(p); - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cudaCheckKernel();"); - p = isl_printer_end_line(p); - - p = ppcg_end_block(p); - - p = isl_printer_start_line(p); - p = isl_printer_end_line(p); - -#if 0 - print_kernel(data->prog, kernel, data->cuda); -#endif - - return p; -} - -static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p, - struct gpu_prog *prog, __isl_keep isl_ast_node *tree, - struct cuda_info *cuda) -{ - isl_ast_print_options 
*print_options; - isl_ctx *ctx = isl_ast_node_get_ctx(tree); - struct print_host_user_data data = { cuda, prog }; - - print_options = isl_ast_print_options_alloc(ctx); - print_options = isl_ast_print_options_set_print_user(print_options, - &print_host_user, &data); - - p = gpu_print_macros(p, tree); - p = isl_ast_node_print(tree, p, print_options); - - return p; -} - -/* Given a gpu_prog "prog" and the corresponding transformed AST - * "tree", print the entire CUDA code to "p". - * "types" collects the types for which a definition has already - * been printed. - */ -static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p, - struct gpu_prog *prog, __isl_keep isl_ast_node *tree, - struct gpu_types *types, void *user) -{ - struct cuda_info *cuda = user; - isl_printer *kernel; - - kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c); - kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C); - kernel = gpu_print_types(kernel, types, prog); - isl_printer_free(kernel); - - if (!kernel) - return isl_printer_free(p); - - p = print_host_code(p, prog, tree, cuda); - - return p; -} - -/* Transform the code in the file called "input" by replacing - * all scops by corresponding CUDA code. - * The names of the output files are derived from "input". - * - * We let generate_gpu do all the hard work and then let it call - * us back for printing the AST in print_cuda. - * - * To prepare for this printing, we first open the output files - * and we close them after generate_gpu has finished. 
- */ -int generate_cuda(isl_ctx *ctx, struct ppcg_options *options, - const char *input) -{ - struct cuda_info cuda; - int r; - - cuda_open_files(&cuda, input); - - r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda); - - cuda_close_files(&cuda); - - return r; -} diff --git a/polly/lib/External/ppcg/cuda_common.h b/polly/lib/External/ppcg/cuda_common.h deleted file mode 100644 --- a/polly/lib/External/ppcg/cuda_common.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _CUDA_COMMON_H_ -#define _CUDA_COMMON_H_ - -#include - -struct cuda_info { - FILE *host_c; - FILE *kernel_c; - FILE *kernel_h; -}; - -void cuda_open_files(struct cuda_info *info, const char *input); -void cuda_close_files(struct cuda_info *info); - -#endif diff --git a/polly/lib/External/ppcg/cuda_common.c b/polly/lib/External/ppcg/cuda_common.c deleted file mode 100644 --- a/polly/lib/External/ppcg/cuda_common.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2010 INRIA Saclay - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, - * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, - * 91893 Orsay, France - */ - -#include -#include -#include - -#include "cuda_common.h" -#include "ppcg.h" - -/* Open the host .cu file and the kernel .hu and .cu files for writing. - * Add the necessary includes. 
- */ -void cuda_open_files(struct cuda_info *info, const char *input) -{ - char name[PATH_MAX]; - int len; - - len = ppcg_extract_base_name(name, input); - - strcpy(name + len, "_host.cu"); - info->host_c = fopen(name, "w"); - - strcpy(name + len, "_kernel.cu"); - info->kernel_c = fopen(name, "w"); - - strcpy(name + len, "_kernel.hu"); - info->kernel_h = fopen(name, "w"); - fprintf(info->host_c, "#include \n"); - fprintf(info->host_c, "#include \n"); - fprintf(info->host_c, "#include \"%s\"\n", name); - fprintf(info->kernel_c, "#include \"%s\"\n", name); - fprintf(info->kernel_h, "#include \"cuda.h\"\n\n"); -} - -/* Close all output files. - */ -void cuda_close_files(struct cuda_info *info) -{ - fclose(info->kernel_c); - fclose(info->kernel_h); - fclose(info->host_c); -} diff --git a/polly/lib/External/ppcg/external.c b/polly/lib/External/ppcg/external.c deleted file mode 100644 --- a/polly/lib/External/ppcg/external.c +++ /dev/null @@ -1,192 +0,0 @@ -#include -#include -#include -#include -#include "cpu.h" -#include "opencl.h" - - -#define die() { \ - fprintf(stderr, "Dummy function %s called\n", __FUNCTION__); \ - abort(); \ -} - -__isl_give isl_union_map *pet_scop_compute_outer_to_any( - __isl_keep pet_scop *scop) { - die(); -} -__isl_give isl_union_map *pet_scop_compute_outer_to_inner( - __isl_keep pet_scop *scop) { - die(); -} -enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree) { - die(); -} -int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree, - int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) { - die(); -} -isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr) { - die(); -} -isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr) { - die(); -} -isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr) { - die(); -} -__isl_give isl_union_map *pet_expr_access_get_tagged_may_read( - __isl_keep pet_expr *expr) { - die(); -} -__isl_give isl_union_map *pet_expr_access_get_tagged_may_write( - __isl_keep pet_expr *expr) { - 
die(); -} -__isl_give isl_union_map *pet_expr_access_get_must_write( - __isl_keep pet_expr *expr) { - die(); -} -__isl_give isl_multi_pw_aff *pet_expr_access_get_index( - __isl_keep pet_expr *expr) { - die(); -} -__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr) { - die(); -} -__isl_give isl_printer *print_cpu(__isl_take isl_printer *p, - struct ppcg_scop *ps, struct ppcg_options *options) { - die(); -} - -__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt, - __isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr) { - die(); -} -unsigned pet_loc_get_start(__isl_keep pet_loc *loc) { - die(); -} -unsigned pet_loc_get_end(__isl_keep pet_loc *loc) { - die(); -} -int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output, - __isl_give isl_printer *(*transform)(__isl_take isl_printer *p, - __isl_take pet_scop *scop, void *user), void *user) { - die(); -} -__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop, - __isl_take isl_printer *p) { - die(); -} -__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop) { - die(); -} -__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop) { - die(); -} -int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop) { - die(); -} -int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop) { - die(); -} -int pet_tree_foreach_expr(__isl_keep pet_tree *tree, - int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) { - die(); -} -int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr, - int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) { - die(); -} -int pet_stmt_is_kill(struct pet_stmt *stmt) { - die(); -} -struct isl_args pet_options_args; -const char *ppcg_version(void) { - die(); -} -int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val) { - die(); -} -int generate_opencl(isl_ctx *ctx, struct ppcg_options *options, - const char *input, const char *output) { - die(); -} -int 
generate_cpu(isl_ctx *ctx, struct ppcg_options *options, - const char *input, const char *output) { - die(); -} -__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt, - __isl_keep isl_ast_build *build, - __isl_give isl_multi_pw_aff *(*fn_index)( - __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id, - void *user), void *user_index, - __isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr, - __isl_keep isl_id *id, void *user), void *user_expr) { - die(); -} -__isl_give isl_union_map *pet_scop_get_tagged_may_reads( - __isl_keep pet_scop *scop) { - die(); -} -__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop) { - die(); -} -__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop) { - die(); -} -__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop) { - die(); -} -__isl_give isl_union_map *pet_scop_get_tagged_may_writes( - __isl_keep pet_scop *scop) { - die(); -} -__isl_give isl_union_map *pet_scop_get_tagged_must_writes( - __isl_keep pet_scop *scop) { - die(); -} -__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop) { - die(); -} -__isl_give isl_union_map *pet_scop_get_tagged_must_kills( - __isl_keep pet_scop *scop) { - die(); -} -__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr) { - die(); -} -__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr, - __isl_keep const char *name) { - die(); -} -__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos) { - die(); -} -__isl_give pet_expr *pet_expr_new_cast(const char *type_name, - __isl_take pet_expr *arg) { - die(); -} -__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos, - __isl_take pet_expr *arg) { - die(); -} -__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree) { - die(); -} -__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree) { - die(); -} -__isl_give pet_tree 
*pet_tree_map_call_expr(__isl_take pet_tree *tree, - __isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user), - void *user) { - die(); -} -__isl_give isl_union_map *pet_expr_access_get_may_read( - __isl_keep pet_expr *expr) { - die(); -} -__isl_give isl_union_map *pet_expr_access_get_may_write( - __isl_keep pet_expr *expr) { - die(); -} diff --git a/polly/lib/External/ppcg/gpu.h b/polly/lib/External/ppcg/gpu.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu.h +++ /dev/null @@ -1,459 +0,0 @@ -#ifndef _GPU_H -#define _GPU_H - -#include -#include -#include - -#include - -#include "ppcg.h" -#include "ppcg_options.h" - -/* An access to an outer array element or an iterator. - * Accesses to iterators have an access relation that maps to an unnamed space. - * An access may be both read and write. - * If the access relation is empty, then the output dimension may - * not be equal to the dimension of the corresponding array. - */ -struct gpu_stmt_access { - /* Access reads elements */ - int read; - /* Access writes elements */ - int write; - /* All writes are definite writes. */ - int exact_write; - /* Is a single, fixed element being accessed? */ - isl_bool fixed_element; - /* The number of index expressions specified in the access. */ - int n_index; - - /* May access relation */ - isl_map *access; - /* May access relation with as domain a mapping from iteration domain - * to a reference identifier. - */ - isl_map *tagged_access; - /* The reference id of the corresponding pet_expr. */ - isl_id *ref_id; - - struct gpu_stmt_access *next; -}; - -/* A representation of a user statement. - * "stmt" points to the corresponding pet statement. - * "id" is the identifier of the instance set of the statement. - * "accesses" is a linked list of accesses performed by the statement. - * If the statement has been killed, i.e., if it will not be scheduled, - * then this linked list may be empty even if the actual statement does - * perform accesses. 
- */ -struct gpu_stmt { - isl_id *id; - struct pet_stmt *stmt; - - struct gpu_stmt_access *accesses; -}; - -/* Represents an outer array possibly accessed by a gpu_prog. - */ -struct gpu_array_info { - /* The array data space. */ - isl_space *space; - /* Element type. */ - char *type; - /* Element size. */ - int size; - /* Name of the array. */ - char *name; - /* Declared extent of original array. */ - isl_set *declared_extent; - /* AST expression for declared size of original array. */ - isl_ast_expr *declared_size; - /* Extent of the array that needs to be copied. */ - isl_set *extent; - /* Number of indices. */ - unsigned n_index; - /* For each index, a bound on "extent" in that direction. */ - isl_multi_pw_aff *bound; - /* The corresponding access AST expression, if the array needs - * to be allocated on the device. - */ - isl_ast_expr *bound_expr; - - /* All references to this array; point to elements of a linked list. */ - int n_ref; - struct gpu_stmt_access **refs; - - /* Is this array accessed at all by the program? */ - int accessed; - - /* Is this a scalar that is read-only within the entire program? */ - int read_only_scalar; - - /* Are the elements of the array structures? */ - int has_compound_element; - - /* Are the elements only accessed through constant index expressions? */ - int only_fixed_element; - - /* Is the array local to the scop? */ - int local; - /* Is the array local and should it be declared on the host? */ - int declare_local; - - /* Is the corresponding global device memory accessed in any way? */ - int global; - - /* Should the array be linearized? */ - int linearize; - - /* Order dependences on this array. - * Only used if live_range_reordering option is set. - * It is set to NULL otherwise. - */ - isl_union_map *dep_order; - - void *user; -}; - -/* Represents an outer array accessed by a ppcg_kernel, localized - * to the context of this kernel. - * - * "array" points to the corresponding array in the gpu_prog. 
- * The "n_group" "groups" are the reference groups associated to the array. - * If "force_private" is set, then the array (in practice a scalar) - * must be mapped to a register. - * "global" is set if the global device memory corresponding - * to this array is accessed by the kernel. - * "bound" is equal to array->bound specialized to the current kernel. - * "bound_expr" is the corresponding access AST expression. - */ -struct gpu_local_array_info { - struct gpu_array_info *array; - - int n_group; - struct gpu_array_ref_group **groups; - - int force_private; - int global; - - unsigned n_index; - isl_multi_pw_aff *bound; - isl_ast_expr *bound_expr; -}; - -__isl_give isl_ast_expr *gpu_local_array_info_linearize_index( - struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr); - -/* A sequence of "n" names of types. - */ -struct gpu_types { - int n; - char **name; -}; - -/* "read" and "write" contain the original access relations, possibly - * involving member accesses. - * - * The elements of "array", as well as the ranges of "copy_in" and "copy_out" - * only refer to the outer arrays of any possible member accesses. - */ -struct gpu_prog { - isl_ctx *ctx; - - struct ppcg_scop *scop; - - /* Set of parameter values */ - isl_set *context; - - /* All potential read accesses in the entire program */ - isl_union_map *read; - - /* All potential write accesses in the entire program */ - isl_union_map *may_write; - /* All definite write accesses in the entire program */ - isl_union_map *must_write; - /* All tagged definite kills in the entire program */ - isl_union_map *tagged_must_kill; - - /* The set of inner array elements that may be preserved. */ - isl_union_set *may_persist; - - /* A mapping from all innermost arrays to their outer arrays. */ - isl_union_map *to_outer; - /* A mapping from the outer arrays to all corresponding inner arrays. 
*/ - isl_union_map *to_inner; - /* A mapping from all intermediate arrays to their outer arrays, - * including an identity mapping from the anonymous 1D space to itself. - */ - isl_union_map *any_to_outer; - - /* Order dependences on non-scalars. */ - isl_union_map *array_order; - - /* Array of statements */ - int n_stmts; - struct gpu_stmt *stmts; - - int n_array; - struct gpu_array_info *array; -}; - -struct gpu_gen { - isl_ctx *ctx; - struct ppcg_options *options; - - /* Callback for printing of AST in appropriate format. */ - __isl_give isl_printer *(*print)(__isl_take isl_printer *p, - struct gpu_prog *prog, __isl_keep isl_ast_node *tree, - struct gpu_types *types, void *user); - void *print_user; - - isl_id_to_ast_expr *(*build_ast_expr)(void *stmt, - isl_ast_build *build, - isl_multi_pw_aff *(*fn_index)( - __isl_take isl_multi_pw_aff *mpa, isl_id *id, - void *user), - void *user_index, - isl_ast_expr *(*fn_expr)(isl_ast_expr *expr, - isl_id *id, void *user), - void *user_expr); - - struct gpu_prog *prog; - /* The generated AST. */ - isl_ast_node *tree; - - /* The sequence of types for which a definition has been printed. */ - struct gpu_types types; - - /* User specified tile, grid and block sizes for each kernel */ - isl_union_map *sizes; - - /* Effectively used tile, grid and block sizes for each kernel */ - isl_union_map *used_sizes; - - /* Identifier of the next kernel. */ - int kernel_id; -}; - -enum ppcg_group_access_type { - ppcg_access_global, - ppcg_access_shared, - ppcg_access_private -}; - -enum ppcg_kernel_stmt_type { - ppcg_kernel_copy, - ppcg_kernel_domain, - ppcg_kernel_sync -}; - -/* Representation of special statements, in particular copy statements - * and __syncthreads statements, inside a kernel. - * - * type represents the kind of statement - * - * - * for ppcg_kernel_copy statements we have - * - * read is set if the statement should copy data from global memory - * to shared memory or registers. 
- * - * index expresses an access to the array element that needs to be copied - * local_index expresses the corresponding element in the tile - * - * array refers to the original array being copied - * local_array is a pointer to the appropriate element in the "array" - * array of the ppcg_kernel to which this copy access belongs - * - * - * for ppcg_kernel_domain statements we have - * - * stmt is the corresponding input statement - * - * n_access is the number of accesses in stmt - * access is an array of local information about the accesses - */ -struct ppcg_kernel_stmt { - enum ppcg_kernel_stmt_type type; - - union { - struct { - int read; - isl_ast_expr *index; - isl_ast_expr *local_index; - struct gpu_array_info *array; - struct gpu_local_array_info *local_array; - } c; - struct { - struct gpu_stmt *stmt; - isl_id_to_ast_expr *ref2expr; - } d; - } u; -}; - -/* Representation of a local variable in a kernel. - */ -struct ppcg_kernel_var { - struct gpu_array_info *array; - enum ppcg_group_access_type type; - char *name; - isl_vec *size; -}; - -/* Representation of a kernel. - * - * prog describes the original code from which the kernel is extracted. - * - * id is the sequence number of the kernel. - * - * block_ids contains the list of block identifiers for this kernel. - * thread_ids contains the list of thread identifiers for this kernel. - * - * the first n_grid elements of grid_dim represent the specified size - * of the grid. - * the first n_block elements of block_dim represent the specified or - * effective size of the block. - * Note that in the input file, the sizes of the grid and the blocks - * are specified in the order x, y, z, but internally, the sizes - * are stored in reverse order, so that the last element always - * refers to the x dimension. - * - * grid_size reflects the effective grid size. - * grid_size_expr contains a corresponding access AST expression, built within - * the context where the launch appears. 
- * - * context contains the values of the parameters and outer schedule dimensions - * for which any statement instance in this kernel needs to be executed. - * - * n_sync is the number of synchronization operations that have - * been introduced in the schedule tree corresponding to this kernel (so far). - * - * core contains the spaces of the statement domains that form - * the core computation of the kernel. It is used to navigate - * the tree during the construction of the device part of the schedule - * tree in gpu_create_kernel. - * - * expanded_domain contains the original statement instances, - * i.e., those that appear in the domains of access relations, - * that are involved in the kernel. - * contraction maps those original statement instances to - * the statement instances that are active at the point - * in the schedule tree where the kernel is created. - * - * arrays is the set of possibly accessed outer array elements. - * - * space is the schedule space of the AST context. That is, it represents - * the loops of the generated host code containing the kernel launch. - * - * n_array is the total number of arrays in the input program and also - * the number of element in the array array. - * array contains information about each array that is local - * to the current kernel. If an array is not used in a kernel, - * then the corresponding entry does not contain any information. - * - * any_force_private is set if any array in the kernel is marked force_private - * - * block_filter contains constraints on the domain elements in the kernel - * that encode the mapping to block identifiers, where the block identifiers - * are represented by "n_grid" parameters with as names the elements - * of "block_ids". - * - * thread_filter contains constraints on the domain elements in the kernel - * that encode the mapping to thread identifiers, where the thread identifiers - * are represented by "n_block" parameters with as names the elements - * of "thread_ids". 
- * - * copy_schedule corresponds to the schedule dimensions of - * the (tiled) schedule for this kernel that have been taken into account - * for computing private/shared memory tiles. - * The domain corresponds to the original statement instances, i.e., - * those that appear in the leaves of the schedule tree. - * copy_schedule_dim is the dimension of this schedule. - * - * sync_writes contains write references that require synchronization. - * Each reference is represented by a universe set in a space [S[i,j] -> R[]] - * with S[i,j] the statement instance space and R[] the array reference. - */ -struct ppcg_kernel { - isl_ctx *ctx; - struct ppcg_options *options; - - struct gpu_prog *prog; - - int id; - - isl_id_list *block_ids; - isl_id_list *thread_ids; - - int n_grid; - int n_block; - int grid_dim[2]; - int block_dim[3]; - - isl_multi_pw_aff *grid_size; - isl_ast_expr *grid_size_expr; - isl_set *context; - - int n_sync; - isl_union_set *core; - isl_union_set *arrays; - - isl_union_pw_multi_aff *contraction; - isl_union_set *expanded_domain; - - isl_space *space; - - int n_array; - struct gpu_local_array_info *array; - - int n_var; - struct ppcg_kernel_var *var; - - int any_force_private; - - isl_union_set *block_filter; - isl_union_set *thread_filter; - isl_union_pw_multi_aff *copy_schedule; - int copy_schedule_dim; - - isl_union_set *sync_writes; - - isl_ast_node *tree; -}; - -int gpu_array_is_scalar(struct gpu_array_info *array); -int gpu_array_is_read_only_scalar(struct gpu_array_info *array); -int gpu_array_requires_device_allocation(struct gpu_array_info *array); -__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array); -isl_bool gpu_array_can_be_private(struct gpu_array_info *array); - -struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop); -void *gpu_prog_free(struct gpu_prog *prog); - -int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i); - -int generate_gpu(isl_ctx *ctx, const char *input, 
FILE *out, - struct ppcg_options *options, - __isl_give isl_printer *(*print)(__isl_take isl_printer *p, - struct gpu_prog *prog, __isl_keep isl_ast_node *tree, - struct gpu_types *types, void *user), void *user); - -__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen, - __isl_take isl_schedule_node *node, int scale, - __isl_keep isl_multi_val *sizes); - -__isl_give isl_schedule *get_schedule(struct gpu_gen *gen); -int has_any_permutable_node(__isl_keep isl_schedule *schedule); -__isl_give isl_schedule *map_to_device(struct gpu_gen *gen, - __isl_take isl_schedule *schedule, - int to_from_device); -__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, - __isl_take isl_schedule *schedule); - -__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog); -void collect_references(struct gpu_prog *prog, struct gpu_array_info *array); -void collect_order_dependences(struct gpu_prog *prog); -isl_bool only_fixed_element_accessed(struct gpu_array_info *array); -#endif diff --git a/polly/lib/External/ppcg/gpu.c b/polly/lib/External/ppcg/gpu.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu.c +++ /dev/null @@ -1,5849 +0,0 @@ -/* - * Copyright 2010-2011 INRIA Saclay - * Copyright 2012-2013 Ecole Normale Superieure - * Copyright 2015-2016 Sven Verdoolaege - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, - * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, - * 91893 Orsay, France - * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cpu.h" -#include "gpu.h" -#include "gpu_array_tile.h" -#include "gpu_group.h" -#include "gpu_hybrid.h" -#include "gpu_tree.h" -#include "hybrid.h" -#include "schedule.h" -#include "ppcg_options.h" -#include "print.h" -#include "util.h" - -struct gpu_array_info; - 
-/* Return the name of the outer array (of structs) accessed by "access". - */ -static const char *get_outer_array_name(__isl_keep isl_map *access) -{ - isl_space *space; - const char *name; - - space = isl_space_range(isl_map_get_space(access)); - while (space && isl_space_is_wrapping(space)) - space = isl_space_domain(isl_space_unwrap(space)); - name = isl_space_get_tuple_name(space, isl_dim_set); - isl_space_free(space); - - return name; -} - -/* Collect all references to the given array and store pointers to them - * in array->refs. - */ -void collect_references(struct gpu_prog *prog, - struct gpu_array_info *array) -{ - int i; - int n; - - n = 0; - for (i = 0; i < prog->n_stmts; ++i) { - struct gpu_stmt *stmt = &prog->stmts[i]; - struct gpu_stmt_access *access; - - for (access = stmt->accesses; access; access = access->next) { - const char *name; - name = get_outer_array_name(access->access); - if (name && !strcmp(array->name, name)) - n++; - } - } - - array->n_ref = n; - array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n); - assert(array->refs); - - n = 0; - for (i = 0; i < prog->n_stmts; ++i) { - struct gpu_stmt *stmt = &prog->stmts[i]; - struct gpu_stmt_access *access; - - for (access = stmt->accesses; access; access = access->next) { - const char *name; - name = get_outer_array_name(access->access); - if (!name || strcmp(array->name, name)) - continue; - - array->refs[n++] = access; - } - } -} - -/* Compute and return the extent of "array", taking into account the set of - * accessed elements. - * - * In particular, the extent in the outer dimension is taken - * from "accessed", while the extents in the remaining dimensions - * are taken from array->extent. - * - * The extent in the outer dimension cannot be taken from array->extent - * because that may be unbounded. Furthermore, even if it is bounded, - * it may be larger than the piece of the array that is being accessed. 
- */ -static __isl_give isl_set *compute_extent(struct pet_array *array, - __isl_keep isl_set *accessed) -{ - int n_index; - isl_id *id; - isl_set *outer; - isl_set *extent; - - extent = isl_set_copy(array->extent); - - n_index = isl_set_dim(accessed, isl_dim_set); - if (n_index == 0) - return extent; - - extent = isl_set_project_out(extent, isl_dim_set, 0, 1); - outer = isl_set_copy(accessed); - outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1); - extent = isl_set_flat_product(outer, extent); - id = isl_set_get_tuple_id(accessed); - extent = isl_set_set_tuple_id(extent, id); - - return extent; -} - -/* Is the array "array" being extracted a read-only scalar? - * - * That is, is "array" a scalar that is never possibly written to. - * An array containing structures is never considered to be a scalar. - */ -static int is_read_only_scalar(struct gpu_array_info *array, - struct gpu_prog *prog) -{ - isl_set *space; - isl_union_map *write; - int empty; - - if (array->has_compound_element) - return 0; - if (array->n_index != 0) - return 0; - - write = isl_union_map_copy(prog->may_write); - space = isl_set_universe(isl_space_copy(array->space)); - write = isl_union_map_intersect_range(write, - isl_union_set_from_set(space)); - empty = isl_union_map_is_empty(write); - isl_union_map_free(write); - - return empty; -} - -/* Is "array" only accessed as individual, fixed elements? - * That is, does each access to "array" access a single, fixed element? - */ -isl_bool only_fixed_element_accessed(struct gpu_array_info *array) -{ - int i; - - for (i = 0; i < array->n_ref; ++i) - if (!array->refs[i]->fixed_element) - return isl_bool_false; - - return isl_bool_true; -} - -/* Compute bounds on the host array "pa" based on the corresponding - * accessed elements in "arrays" - * and collect all references to the array. - * Store the results in "info". 
- * - * If the array is zero-dimensional and does not contain structures, - * i.e., if the array is a scalar, we check whether it is read-only. - * We also check whether the array is accessed at all. - */ -static int extract_array_info(struct gpu_prog *prog, - struct gpu_array_info *info, struct pet_array *pa, - __isl_keep isl_union_set *arrays) -{ - int empty; - const char *name; - int n_index; - isl_multi_pw_aff *bounds; - isl_set *accessed, *extent; - - n_index = isl_set_dim(pa->extent, isl_dim_set); - name = isl_set_get_tuple_name(pa->extent); - - info->space = isl_set_get_space(pa->extent); - info->name = strdup(name); - info->n_index = n_index; - info->linearize = prog->scop->options->linearize_device_arrays; - - info->type = strdup(pa->element_type); - info->size = pa->element_size; - info->local = pa->declared && !pa->exposed; - info->has_compound_element = pa->element_is_record; - info->read_only_scalar = is_read_only_scalar(info, prog); - - info->declared_extent = isl_set_copy(pa->extent); - accessed = isl_union_set_extract_set(arrays, - isl_space_copy(info->space)); - empty = isl_set_is_empty(accessed); - extent = compute_extent(pa, accessed); - isl_set_free(accessed); - info->extent = extent; - if (empty < 0) - return -1; - info->accessed = !empty; - bounds = ppcg_size_from_extent(isl_set_copy(extent)); - bounds = isl_multi_pw_aff_gist(bounds, isl_set_copy(prog->context)); - if (!bounds) - return -1; - if (!isl_multi_pw_aff_is_cst(bounds)) - info->linearize = 1; - info->bound = bounds; - - collect_references(prog, info); - info->only_fixed_element = only_fixed_element_accessed(info); - - return 0; -} - -/* Remove independence from the order constraints "order" on array "array". - * Since the pairs of iterations in the filter relation of an independence - * are guaranteed to be completely independent by the user, there is - * no need to ensure that live ranges are ordered along those pairs. 
- * We make an exception for local variables, though, as the independence - * guarantee does not apply to those. - * - * The order constraints are used in two places. - * Those on scalars are used in check_scalar_live_ranges to check if - * we need to force the scalar to be private. Any non-local scalar - * should not be forced scalar if it only appears in independent loops. - * Those on non-scalars are added to the coincidence constraints - * in compute_schedule because we do not support any array expansion. - * Accesses to non-local arrays should not prevent a loop from being - * considered coincident so we should indeed remove those constraints - * from the order constraints. - */ -static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog, - struct gpu_array_info *array, __isl_take isl_union_map *order) -{ - // We do not have independence information in Polly. Hence, make this - // function a no-op. - return order; - int i; - - for (i = 0; i < prog->scop->pet->n_independence; ++i) { - struct pet_independence *pi = prog->scop->pet->independences[i]; - if (isl_union_set_contains(pi->local, array->space)) - continue; - - order = isl_union_map_subtract(order, - isl_union_map_copy(pi->filter)); - } - - return order; -} - -/* For each array in "prog", store the (untagged) order dependences - * derived from the array in array->dep_order. - * In particular, consider all references that access the given array - * and take the order dependences that have one of these references - * as source. (Since an order dependence relates two references to - * the same array, the target of these order dependences will also - * be one of these references.) - * Additionally, store the union of these array->dep_order relations - * for all arrays that cannot be mapped to private memory in prog->array_order. 
- */ -void collect_order_dependences(struct gpu_prog *prog) -{ - int i; - isl_space *space; - isl_union_map *accesses; - - space = isl_union_map_get_space(prog->read); - prog->array_order = isl_union_map_empty(space); - - accesses = isl_union_map_copy(prog->scop->tagged_reads); - accesses = isl_union_map_union(accesses, - isl_union_map_copy(prog->scop->tagged_may_writes)); - accesses = isl_union_map_universe(accesses); - accesses = isl_union_map_apply_range(accesses, - isl_union_map_copy(prog->to_outer)); - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_set *set; - isl_union_set *uset; - isl_union_map *order; - - set = isl_set_universe(isl_space_copy(array->space)); - uset = isl_union_set_from_set(set); - uset = isl_union_map_domain( - isl_union_map_intersect_range(isl_union_map_copy(accesses), - uset)); - order = isl_union_map_copy(prog->scop->tagged_dep_order); - order = isl_union_map_intersect_domain(order, uset); - order = isl_union_map_zip(order); - order = isl_union_set_unwrap(isl_union_map_domain(order)); - order = remove_independences(prog, array, order); - array->dep_order = order; - - if (gpu_array_can_be_private(array)) - continue; - - prog->array_order = isl_union_map_union(prog->array_order, - isl_union_map_copy(array->dep_order)); - } - - isl_union_map_free(accesses); -} - -/* Construct a gpu_array_info for each array referenced by prog->scop and - * collect them in prog->array. - * - * The sizes are based on the extents and the set of possibly accessed - * elements by "prog". - * If there are any member accesses involved, then they are first mapped - * to the outer arrays of structs. - * Only extract gpu_array_info entries for these outer arrays. - * - * If we are allowing live range reordering, then also set - * the dep_order field. Otherwise leave it NULL. 
- */ -static int collect_array_info(struct gpu_prog *prog) -{ - int i; - int r = 0; - isl_union_set *arrays; - - arrays = isl_union_map_range(isl_union_map_copy(prog->read)); - arrays = isl_union_set_union(arrays, - isl_union_map_range(isl_union_map_copy(prog->may_write))); - - arrays = isl_union_set_apply(arrays, - isl_union_map_copy(prog->to_outer)); - - arrays = isl_union_set_coalesce(arrays); - - prog->n_array = prog->scop->pet->n_array; - prog->array = isl_calloc_array(prog->ctx, - struct gpu_array_info, prog->n_array); - assert(prog->array); - prog->n_array = 0; - for (i = 0; i < prog->scop->pet->n_array; ++i) { - isl_bool field; - - field = isl_set_is_wrapping(prog->scop->pet->arrays[i]->extent); - if (field < 0) - break; - if (field) - continue; - if (extract_array_info(prog, &prog->array[prog->n_array++], - prog->scop->pet->arrays[i], arrays) < 0) - r = -1; - } - if (i < prog->scop->pet->n_array) - r = -1; - - isl_union_set_free(arrays); - - if (prog->scop->options->live_range_reordering) - collect_order_dependences(prog); - - return r; -} - -static void free_array_info(struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - free(prog->array[i].type); - free(prog->array[i].name); - isl_multi_pw_aff_free(prog->array[i].bound); - isl_ast_expr_free(prog->array[i].bound_expr); - isl_space_free(prog->array[i].space); - isl_set_free(prog->array[i].declared_extent); - isl_set_free(prog->array[i].extent); - isl_ast_expr_free(prog->array[i].declared_size); - free(prog->array[i].refs); - isl_union_map_free(prog->array[i].dep_order); - } - free(prog->array); -} - -/* Check if a gpu array is a scalar. A scalar is a value that is not stored - * as an array or through a pointer reference, but as a single data element. - * At the moment, scalars are represented as zero-dimensional arrays. - * Note that the single data element may be an entire structure. 
- */ -int gpu_array_is_scalar(struct gpu_array_info *array) -{ - return array->n_index == 0; -} - -/* Can "array" be mapped to private memory? - * That is, is it only accessed as individual elements with - * constant index expressions? - */ -isl_bool gpu_array_can_be_private(struct gpu_array_info *array) -{ - if (!array) - return isl_bool_error; - return array->only_fixed_element; -} - -/* Is "array" a read-only scalar? - */ -int gpu_array_is_read_only_scalar(struct gpu_array_info *array) -{ - return array->read_only_scalar; -} - -/* Does "array" need to be allocated on the device? - * If it is a read-only scalar, then it will be passed as an argument - * to the kernel and therefore does not require any allocation. - * If this device memory is not accessed at all, then it does not - * need to be allocated either. - */ -int gpu_array_requires_device_allocation(struct gpu_array_info *array) -{ - if (gpu_array_is_read_only_scalar(array)) - return 0; - if (!array->global) - return 0; - return 1; -} - -/* Return the set of parameter values for which the array has a positive - * size in all dimensions. - * If the sizes are only valid for some parameter values, then those - * constraints are also taken into account. - */ -__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array) -{ - int i; - isl_space *space; - isl_set *guard; - - if (!array) - return NULL; - - space = isl_space_params(isl_space_copy(array->space)); - guard = isl_set_universe(space); - - for (i = 0; i < array->n_index; ++i) { - isl_pw_aff *bound; - isl_set *guard_i, *zero; - - bound = isl_multi_pw_aff_get_pw_aff(array->bound, i); - guard_i = isl_pw_aff_nonneg_set(isl_pw_aff_copy(bound)); - zero = isl_pw_aff_zero_set(bound); - guard_i = isl_set_subtract(guard_i, zero); - guard = isl_set_intersect(guard, guard_i); - } - - return guard; -} - -/* Internal data structure for extract_size_of_type. - * "type" specifies the name of the space that we want to extract. 
- * "res" is used to store the subset of that space. - */ -struct ppcg_extract_size_data { - const char *type; - isl_set *res; -}; - -/* This function is called for each set in a union_set. - * If the name of the set matches data->type, we store the - * set in data->res. - */ -static isl_stat extract_size_of_type(__isl_take isl_set *size, void *user) -{ - struct ppcg_extract_size_data *data = user; - const char *name; - - name = isl_set_get_tuple_name(size); - if (name && !strcmp(name, data->type)) { - data->res = size; - return isl_stat_error; - } - - isl_set_free(size); - return isl_stat_ok; -} - -/* Given a union map { kernel[i] -> *[...] }, - * return the range in the space called "type" for the kernel with - * sequence number "id". - */ -static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes, - const char *type, int id) -{ - isl_space *space; - isl_set *dom; - isl_union_set *local_sizes; - struct ppcg_extract_size_data data = { type, NULL }; - - if (!sizes) - return NULL; - - space = isl_union_map_get_space(sizes); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, 1); - space = isl_space_set_tuple_name(space, isl_dim_set, "kernel"); - dom = isl_set_universe(space); - dom = isl_set_fix_si(dom, isl_dim_set, 0, id); - - local_sizes = isl_union_set_apply(isl_union_set_from_set(dom), - isl_union_map_copy(sizes)); - isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data); - isl_union_set_free(local_sizes); - return data.res; -} - -/* Given a singleton set, extract the first (at most *len) elements - * of the single integer tuple into *sizes and update *len if needed. 
- */ -static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len) -{ - int i; - int dim; - - if (!set) - return; - - dim = isl_set_dim(set, isl_dim_set); - if (dim < *len) - *len = dim; - - for (i = 0; i < *len; ++i) { - isl_val *v; - - v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i); - assert(v); - - sizes[i] = isl_val_get_num_si(v); - isl_val_free(v); - } - - isl_set_free(set); -} - -/* Add the map { kernel[id] -> type[sizes] } to gen->used_sizes, - * if the option debug->dump_sizes is set. - */ -static void set_used_sizes(struct gpu_gen *gen, const char *type, int id, - int *sizes, int len) -{ - int i; - isl_space *space; - isl_map *map; - - if (!gen->options->debug->dump_sizes) - return; - - space = isl_union_map_get_space(gen->used_sizes); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, 1); - space = isl_space_set_tuple_name(space, isl_dim_set, "kernel"); - space = isl_space_from_domain(space); - space = isl_space_add_dims(space, isl_dim_out, len); - space = isl_space_set_tuple_name(space, isl_dim_out, type); - - map = isl_map_universe(space); - map = isl_map_fix_si(map, isl_dim_in, 0, id); - for (i = 0; i < len; ++i) - map = isl_map_fix_si(map, isl_dim_out, i, sizes[i]); - - gen->used_sizes = isl_union_map_add_map(gen->used_sizes, map); -} - -/* Extract user specified "tile" sizes from the "sizes" command line option, - * defaulting to option->tile_size in each dimension. - * *tile_len contains the maximum number of tile sizes needed. - * Update *tile_len to the number of specified tile sizes, if any, and - * return a pointer to the tile sizes (or NULL on error). - * Add the effectively used sizes to gen->used_sizes. 
- */ -static int *read_tile_sizes(struct gpu_gen *gen, int *tile_len) -{ - int n; - int *tile_size; - isl_set *size; - - tile_size = isl_alloc_array(gen->ctx, int, *tile_len); - if (!tile_size) - return NULL; - for (n = 0; n < *tile_len; ++n) - tile_size[n] = gen->options->tile_size; - - size = extract_sizes(gen->sizes, "tile", gen->kernel_id); - read_sizes_from_set(size, tile_size, tile_len); - set_used_sizes(gen, "tile", gen->kernel_id, tile_size, *tile_len); - - return tile_size; -} - -/* Extract user specified "block" sizes from the "sizes" command line option, - * after filling in some potentially useful defaults. - */ -static void read_block_sizes(struct ppcg_kernel *kernel, - __isl_keep isl_union_map *sizes) -{ - isl_set *size; - - if (kernel->n_block > 3) - kernel->n_block = 3; - switch (kernel->n_block) { - case 1: - kernel->block_dim[0] = 512; - break; - case 2: - kernel->block_dim[0] = 32; - kernel->block_dim[1] = 16; - break; - default: - kernel->block_dim[0] = 32; - kernel->block_dim[1] = 4; - kernel->block_dim[2] = 4; - break; - } - - size = extract_sizes(sizes, "block", kernel->id); - read_sizes_from_set(size, kernel->block_dim, &kernel->n_block); -} - -/* Extract user specified "grid" sizes from the "sizes" command line option, - * after filling in some potentially useful defaults. - */ -static void read_grid_sizes(struct ppcg_kernel *kernel, - __isl_keep isl_union_map *sizes) -{ - isl_set *size; - - if (kernel->n_grid > 2) - kernel->n_grid = 2; - switch (kernel->n_grid) { - case 1: - kernel->grid_dim[0] = 32768; - break; - default: - kernel->grid_dim[0] = 256; - kernel->grid_dim[1] = 256; - break; - } - - size = extract_sizes(sizes, "grid", kernel->id); - read_sizes_from_set(size, kernel->grid_dim, &kernel->n_grid); -} - -/* Extract user specified grid and block sizes from the gen->sizes - * command line option after filling in some potentially useful defaults. - * Store the extracted sizes in "kernel". 
- * Add the effectively used sizes to gen->used_sizes. - */ -static void read_grid_and_block_sizes(struct ppcg_kernel *kernel, - struct gpu_gen *gen) -{ - read_block_sizes(kernel, gen->sizes); - read_grid_sizes(kernel, gen->sizes); - set_used_sizes(gen, "block", kernel->id, - kernel->block_dim, kernel->n_block); - set_used_sizes(gen, "grid", kernel->id, - kernel->grid_dim, kernel->n_grid); -} - -static void *free_stmts(struct gpu_stmt *stmts, int n) -{ - int i; - - if (!stmts) - return NULL; - - for (i = 0; i < n; ++i) { - struct gpu_stmt_access *access, *next; - - for (access = stmts[i].accesses; access; access = next) { - next = access->next; - isl_id_free(access->ref_id); - isl_map_free(access->access); - isl_map_free(access->tagged_access); - free(access); - } - - isl_id_free(stmts[i].id); - } - free(stmts); - - return NULL; -} - -/* Add parameters p[i] with identifiers "ids" to "set", - * with bounds to 0 <= p[i] < size[i]. - */ -__isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set, - int *size, __isl_keep isl_id_list *ids) -{ - int i, len; - unsigned nparam; - - len = isl_id_list_n_id(ids); - nparam = isl_set_dim(set, isl_dim_param); - set = isl_set_add_dims(set, isl_dim_param, len); - - for (i = 0; i < len; ++i) { - isl_id *id; - - id = isl_id_list_get_id(ids, i); - set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id); - set = isl_set_lower_bound_si(set, isl_dim_param, nparam + i, 0); - set = isl_set_upper_bound_si(set, isl_dim_param, - nparam + i, size[i] - 1); - } - - return set; -} - -/* Add "len" parameters p[i] with identifiers "ids" and intersect "set" - * with - * - * { : 0 <= p[i] < size[i] } - * - * or an overapproximation. 
- */ -static __isl_give isl_set *add_bounded_parameters_dynamic( - __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size, - __isl_keep isl_id_list *ids) -{ - int i, len; - unsigned nparam; - isl_space *space; - isl_local_space *ls; - - len = isl_multi_pw_aff_dim(size, isl_dim_out); - nparam = isl_set_dim(set, isl_dim_param); - set = isl_set_add_dims(set, isl_dim_param, len); - - for (i = 0; i < len; ++i) { - isl_id *id; - - id = isl_id_list_get_id(ids, i); - set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id); - } - - space = isl_space_params(isl_set_get_space(set)); - ls = isl_local_space_from_space(space); - for (i = 0; i < len; ++i) { - isl_pw_aff *param, *size_i, *zero; - isl_set *bound; - - param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls), - isl_dim_param, nparam + i); - - size_i = isl_multi_pw_aff_get_pw_aff(size, i); - bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i); - bound = isl_set_from_basic_set(isl_set_simple_hull(bound)); - set = isl_set_intersect_params(set, bound); - - zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls)); - bound = isl_pw_aff_ge_set(param, zero); - set = isl_set_intersect_params(set, bound); - } - isl_local_space_free(ls); - - return set; -} - -/* Return the union of all tagged access relations in the group. - */ -static __isl_give isl_union_map *group_tagged_access_relation( - struct gpu_array_ref_group *group) -{ - int i; - isl_union_map *access; - - access = isl_union_map_empty(isl_map_get_space(group->access)); - for (i = 0; i < group->n_ref; ++i) { - isl_map *map_i; - - map_i = isl_map_copy(group->refs[i]->tagged_access); - access = isl_union_map_union(access, - isl_union_map_from_map(map_i)); - } - - return access; -} - -/* Return the extent of "array", recomputed from the bounds. - * The recomputed extent may be simpler than the original extent. 
- */ -static __isl_give isl_set *array_extent(struct gpu_array_info *array) -{ - int i; - isl_id *id; - isl_space *space; - isl_local_space *ls; - isl_set *extent; - - id = isl_set_get_tuple_id(array->extent); - space = isl_set_get_space(array->extent); - extent = isl_set_universe(isl_space_copy(space)); - ls = isl_local_space_from_space(space); - for (i = 0; i < array->n_index; ++i) { - isl_pw_aff *bound; - isl_aff *aff; - isl_pw_aff *index; - isl_set *lt; - - extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0); - - aff = isl_aff_var_on_domain(isl_local_space_copy(ls), - isl_dim_set, i); - index = isl_pw_aff_from_aff(aff); - bound = isl_multi_pw_aff_get_pw_aff(array->bound, i); - bound = isl_pw_aff_from_range(bound); - bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index); - bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in, - isl_id_copy(id)); - lt = isl_pw_aff_lt_set(index, bound); - extent = isl_set_intersect(extent, lt); - } - isl_local_space_free(ls); - isl_id_free(id); - - return extent; -} - -/* Return a map from the first group->shared_tile->depth dimensions - * of the computed schedule to the array tile in - * global memory that corresponds to the shared memory copy. - * - * In particular, return a map - * - * { D[i] -> A[a] } - * - * with constraints - * - * tile_offset(i) <= a <= tile_offset(i) + tile_size - 1 (1) - * - * and - * - * 0 <= a <= array_size - 1 (2) - * - * Note that if some stride has been detected (i.e., when - * group->shared_tile->bound[i].shift is set), then a in (1) refers - * to the shifted and scaled down version. - * - * Constraints (1) are obtained by mapping the size constraints on the - * shared/private memory tile back to the access relation. - * Constraints (2) are obtained from the (recomputed) extent. 
- */ -static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group) -{ - int i; - int n_index = group->array->n_index; - isl_map *tile; - isl_space *space; - isl_set *local; - isl_set *extent; - - space = isl_multi_aff_get_space(group->shared_tile->tiling); - space = isl_space_range(space); - local = isl_set_universe(space); - for (i = 0; i < n_index; ++i) { - isl_val *bound; - - local = isl_set_lower_bound_si(local, isl_dim_set, i, 0); - bound = isl_val_copy(group->shared_tile->bound[i].size); - bound = isl_val_sub_ui(bound, 1); - local = isl_set_upper_bound_val(local, isl_dim_set, i, bound); - } - local = isl_set_preimage_multi_aff(local, - isl_multi_aff_copy(group->shared_tile->tiling)); - tile = isl_set_unwrap(local); - extent = array_extent(group->array); - tile = isl_map_intersect_range(tile, extent); - - return tile; -} - -/* Given a mapping "iterator_map" from the AST schedule to a domain, - * return the corresponding mapping from the AST schedule to - * to the outer kernel->copy_schedule_dim dimensions of - * the schedule computed by PPCG for this kernel. - * - * Note that kernel->copy_schedule_dim is at least as large as - * the largest depth of any array reference group associated to the kernel. - * This is needed as the returned schedule is used to extract a mapping - * to the outer tile->depth dimensions in transform_index. 
- */ -static __isl_give isl_pw_multi_aff *compute_sched_to_copy( - struct ppcg_kernel *kernel, __isl_take isl_pw_multi_aff *iterator_map) -{ - isl_union_pw_multi_aff *upma; - isl_pw_multi_aff *pma; - isl_space *space; - - space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map)); - space = isl_space_from_domain(space); - space = isl_space_add_dims(space, isl_dim_out, - kernel->copy_schedule_dim); - - upma = isl_union_pw_multi_aff_copy(kernel->copy_schedule); - pma = isl_union_pw_multi_aff_extract_pw_multi_aff(upma, space); - isl_union_pw_multi_aff_free(upma); - - return isl_pw_multi_aff_pullback_pw_multi_aff(pma, iterator_map); -} - -/* If max_shared_memory is not set to infinity (-1), then make - * sure that the total amount of shared memory required by the - * array reference groups mapped to shared memory by "kernel" - * is no larger than this maximum. - * - * We apply a greedy approach and discard (keep in global memory) - * those groups that would result in a total memory size that - * is larger than the maximum. - * - * This function should be called after any function that may - * affect the decision on whether to place a reference group - * in private, shared or global memory. 
- */ -static void check_shared_memory_bound(struct ppcg_kernel *kernel) -{ - int i, j; - isl_val *left, *size; - - if (kernel->options->max_shared_memory < 0) - return; - - left = isl_val_int_from_si(kernel->ctx, - kernel->options->max_shared_memory); - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *local = &kernel->array[i]; - - for (j = 0; j < local->n_group; ++j) { - struct gpu_array_ref_group *group; - enum ppcg_group_access_type type; - - group = local->groups[j]; - type = gpu_array_ref_group_type(group); - if (type != ppcg_access_shared) - continue; - - size = gpu_array_tile_size(group->shared_tile); - size = isl_val_mul_ui(size, local->array->size); - - if (isl_val_le(size, left)) { - left = isl_val_sub(left, size); - continue; - } - isl_val_free(size); - - group->shared_tile = - gpu_array_tile_free(group->shared_tile); - } - } - - isl_val_free(left); -} - -/* Mark all arrays of "kernel" that have an array reference group - * that is not mapped to private or shared memory as - * accessing the corresponding global device memory. - */ -static void mark_global_arrays(struct ppcg_kernel *kernel) -{ - int i, j; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *local = &kernel->array[i]; - - if (local->global) - continue; - for (j = 0; j < local->n_group; ++j) { - if (gpu_array_ref_group_tile(local->groups[j])) - continue; - - local->global = 1; - local->array->global = 1; - break; - } - } -} - -/* Compute a tiling for all the array reference groups in "kernel". - */ -static void compute_group_tilings(struct ppcg_kernel *kernel) -{ - int i, j; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) - gpu_array_ref_group_compute_tiling(array->groups[j]); - } -} - -/* Compute the effective grid size as a list of the sizes in each dimension. 
- * - * The grid size specified by the user or set by default - * in read_grid_sizes() and applied by the block filter, - * may be too large for the given code in the sense that - * it may contain blocks that don't need to execute anything. - * We therefore don't return this grid size, but instead the - * smallest grid size that ensures that all blocks that actually - * execute code are included in the grid. - * - * We first extract a description of the grid, i.e., the possible values - * of the block ids, from the domain elements in "domain" and - * kernel->block_filter. - * The block ids are parameters in kernel->block_filter. - * We simply need to change them into set dimensions. - * - * Then, for each block dimension, we compute the maximal value of the block id - * and add one. - */ -static __isl_give isl_multi_pw_aff *extract_grid_size( - struct ppcg_kernel *kernel, __isl_take isl_union_set *domain) -{ - int i; - isl_set *grid; - isl_set *context; - isl_multi_pw_aff *size; - - domain = isl_union_set_intersect(domain, - isl_union_set_copy(kernel->block_filter)); - grid = isl_union_set_params(domain); - grid = isl_set_from_params(grid); - grid = isl_set_add_dims(grid, isl_dim_set, kernel->n_grid); - for (i = 0; i < kernel->n_grid; ++i) { - int pos; - isl_id *id; - - id = isl_id_list_get_id(kernel->block_ids, i); - pos = isl_set_find_dim_by_id(grid, isl_dim_param, id); - isl_id_free(id); - assert(pos >= 0); - grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i); - grid = isl_set_project_out(grid, isl_dim_param, pos, 1); - } - - grid = isl_set_coalesce(grid); - size = ppcg_size_from_extent(grid); - context = isl_set_params(isl_set_copy(kernel->context)); - return isl_multi_pw_aff_gist(size, context); -} - -/* Compute the size of a fixed bounding box around the origin and "set", - * where "set" is assumed to contain only non-negative elements, - * and store the results in "size". 
- * In particular, compute the maximal value of "set" in each direction - * and add one. - */ -static void extract_fixed_size(__isl_take isl_set *set, int *size) -{ - int i, n; - isl_local_space *ls; - isl_aff *obj; - - n = isl_set_dim(set, isl_dim_set); - ls = isl_local_space_from_space(isl_set_get_space(set)); - obj = isl_aff_zero_on_domain(ls); - for (i = 0; i < n; ++i) { - isl_val *max; - - obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1); - max = isl_set_max_val(set, obj); - size[i] = isl_val_get_num_si(max) + 1; - isl_val_free(max); - obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0); - } - isl_aff_free(obj); - isl_set_free(set); -} - -/* Compute the effective block size as a list of the sizes in each dimension - * and store the sizes in kernel->block_dim. - * - * The block size specified by the user or set by default - * in read_block_sizes() and applied by the thread filter, - * may be too large for the given code in the sense that - * it may contain threads that don't need to execute anything. - * We therefore update this block size in kernel->block_dim - * to the smallest block size that ensures that all threads - * that actually execute code are included in the block. - * - * The set of possible values of the thread ids is obtained from - * the domain elements "domain" and kernel->thread_filter. - * The current implementation eliminates all parameters, ensuring - * that the size is a fixed constant in each dimension. - * In principle we could also compute parametric sizes. - * We would have to make sure to project out all b%d and t%d parameters, - * however. 
- */ -static isl_stat extract_block_size(struct ppcg_kernel *kernel, - __isl_take isl_union_set *domain) -{ - int i; - int nparam; - isl_set *block; - - domain = isl_union_set_intersect(domain, - isl_union_set_copy(kernel->thread_filter)); - block = isl_union_set_params(domain); - block = isl_set_from_params(block); - block = isl_set_add_dims(block, isl_dim_set, kernel->n_block); - for (i = 0; i < kernel->n_block; ++i) { - int pos; - isl_id *id; - - if (!block) - return isl_stat_error; - - id = isl_id_list_get_id(kernel->thread_ids, i); - pos = isl_set_find_dim_by_id(block, isl_dim_param, id); - isl_id_free(id); - if (pos < 0) - isl_die(isl_set_get_ctx(block), isl_error_internal, - "missing constraints on thread identifier", - block = isl_set_free(block)); - block = isl_set_equate(block, isl_dim_param, pos, - isl_dim_set, i); - } - nparam = isl_set_dim(block, isl_dim_param); - block = isl_set_project_out(block, isl_dim_param, 0, nparam); - - if (!block) - return isl_stat_error; - - extract_fixed_size(block, kernel->block_dim); - - return isl_stat_ok; -} - -struct ppcg_kernel *ppcg_kernel_free(struct ppcg_kernel *kernel) -{ - int i, j; - - if (!kernel) - return NULL; - - isl_id_list_free(kernel->block_ids); - isl_id_list_free(kernel->thread_ids); - isl_multi_pw_aff_free(kernel->grid_size); - isl_ast_expr_free(kernel->grid_size_expr); - isl_set_free(kernel->context); - isl_union_set_free(kernel->core); - isl_union_set_free(kernel->arrays); - isl_union_pw_multi_aff_free(kernel->contraction); - isl_union_set_free(kernel->expanded_domain); - isl_space_free(kernel->space); - isl_ast_node_free(kernel->tree); - isl_union_set_free(kernel->block_filter); - isl_union_set_free(kernel->thread_filter); - isl_union_pw_multi_aff_free(kernel->copy_schedule); - isl_union_set_free(kernel->sync_writes); - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) - 
gpu_array_ref_group_free(array->groups[j]); - free(array->groups); - - isl_multi_pw_aff_free(array->bound); - isl_ast_expr_free(array->bound_expr); - } - free(kernel->array); - - for (i = 0; i < kernel->n_var; ++i) { - free(kernel->var[i].name); - isl_vec_free(kernel->var[i].size); - } - free(kernel->var); - - free(kernel); - - return NULL; -} - -/* Wrapper around ppcg_kernel_free for use as a isl_id_set_free_user callback. - */ -static void ppcg_kernel_free_wrap(void *user) -{ - struct ppcg_kernel *kernel = user; - - ppcg_kernel_free(kernel); -} - -static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group, - struct ppcg_kernel_var *var) -{ - int j; - struct gpu_array_tile *tile; - isl_printer *p; - - var->array = group->array; - - var->type = gpu_array_ref_group_type(group); - tile = gpu_array_ref_group_tile(group); - - p = isl_printer_to_str(ctx); - p = gpu_array_ref_group_print_name(group, p); - var->name = isl_printer_get_str(p); - isl_printer_free(p); - - var->size = isl_vec_alloc(ctx, group->array->n_index); - - for (j = 0; j < group->array->n_index; ++j) - var->size = isl_vec_set_element_val(var->size, j, - isl_val_copy(tile->bound[j].size)); -} - -static int create_kernel_vars(struct ppcg_kernel *kernel) -{ - int i, j, n; - - n = 0; - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - enum ppcg_group_access_type type; - - type = gpu_array_ref_group_type(group); - if (type != ppcg_access_global) - ++n; - } - } - - kernel->n_var = n; - kernel->var = isl_calloc_array(kernel->ctx, struct ppcg_kernel_var, n); - if (!kernel->var) - return -1; - - n = 0; - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - enum ppcg_group_access_type type; - - type = 
gpu_array_ref_group_type(group); - if (type == ppcg_access_global) - continue; - create_kernel_var(kernel->ctx, group, &kernel->var[n]); - ++n; - } - } - - return 0; -} - -/* Replace "pa" by the zero function defined over the universe domain - * in the space of "pa". - */ -static __isl_give isl_pw_aff *set_universally_zero(__isl_take isl_pw_aff *pa) -{ - isl_space *space; - isl_aff *zero; - - space = isl_space_domain(isl_pw_aff_get_space(pa)); - isl_pw_aff_free(pa); - zero = isl_aff_zero_on_domain(isl_local_space_from_space(space)); - - return isl_pw_aff_from_aff(zero); -} - -/* The sizes of the arrays on the host that have been computed by - * extract_array_info may depend on the parameters. Use the extra - * constraints on the parameters that are valid at "host_domain" - * to simplify these expressions and store the results in kernel->array. - * - * We only need these localized bounds for arrays that are accessed - * by the current kernel. If we have found at least one reference group - * then the array is accessed by the kernel. - * - * The resulting sizes may be functions that are nowhere defined - * in case the access function cannot possibly access anything inside - * the kernel for some reason. If so, they are replaced by the zero - * function. Since the access function cannot actually access anything, - * there is no harm in printing the array sizes as zero. 
- */ -static void localize_bounds(struct ppcg_kernel *kernel, - __isl_keep isl_set *host_domain) -{ - int i, j; - isl_set *context; - - context = isl_set_copy(host_domain); - context = isl_set_params(context); - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *local = &kernel->array[i]; - isl_multi_pw_aff *bound; - int n_index; - - if (local->n_group == 0) - continue; - - n_index = local->array->n_index; - bound = isl_multi_pw_aff_copy(local->array->bound); - - for (j = 0; j < n_index; ++j) { - isl_pw_aff *pwaff; - int empty; - - pwaff = isl_multi_pw_aff_get_pw_aff(bound, j); - pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context)); - empty = isl_pw_aff_is_empty(pwaff); - if (empty < 0) - pwaff = isl_pw_aff_free(pwaff); - else if (empty) - pwaff = set_universally_zero(pwaff); - bound = isl_multi_pw_aff_set_pw_aff(bound, j, pwaff); - } - - local->n_index = n_index; - local->bound = bound; - } - isl_set_free(context); -} - -/* Create the array of gpu_local_array_info structures "array" - * inside "kernel". The number of elements in this array is - * the same as the number of arrays in "prog". - * Initialize the "array" field of each local array to point - * to the corresponding array in "prog". - */ -static struct ppcg_kernel *ppcg_kernel_create_local_arrays( - struct ppcg_kernel *kernel, struct gpu_prog *prog) -{ - int i; - isl_ctx *ctx; - - ctx = isl_set_get_ctx(prog->context); - kernel->array = isl_calloc_array(ctx, - struct gpu_local_array_info, prog->n_array); - if (!kernel->array) - return ppcg_kernel_free(kernel); - kernel->n_array = prog->n_array; - - for (i = 0; i < prog->n_array; ++i) - kernel->array[i].array = &prog->array[i]; - - return kernel; -} - -/* Does "kernel" need to be passed an argument corresponding to array "i"? - * - * The argument is only needed if the kernel accesses this device memory. 
- */ -int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i) -{ - return kernel->array[i].global; -} - -/* Find the element in gen->stmt that has the given "id". - * Return NULL if no such gpu_stmt can be found. - */ -static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id) -{ - int i; - - for (i = 0; i < prog->n_stmts; ++i) { - if (id == prog->stmts[i].id) - break; - } - - return i < prog->n_stmts ? &prog->stmts[i] : NULL; -} - -void ppcg_kernel_stmt_free(void *user) -{ - struct ppcg_kernel_stmt *stmt = user; - - if (!stmt) - return; - - switch (stmt->type) { - case ppcg_kernel_copy: - isl_ast_expr_free(stmt->u.c.index); - isl_ast_expr_free(stmt->u.c.local_index); - break; - case ppcg_kernel_domain: - isl_id_to_ast_expr_free(stmt->u.d.ref2expr); - break; - case ppcg_kernel_sync: - break; - } - - free(stmt); -} - -/* Return the gpu_stmt_access in the list "accesses" that corresponds - * to "ref_id". - */ -static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses, - __isl_keep isl_id *ref_id) -{ - struct gpu_stmt_access *access; - - for (access = accesses; access; access = access->next) - if (access->ref_id == ref_id) - return access; - - return NULL; -} - -/* Return the index of the array called "name" in the list of arrays. - */ -static int find_array_index(struct ppcg_kernel *kernel, const char *name) -{ - int i; - - for (i = 0; i < kernel->n_array; ++i) - if (!strcmp(name, kernel->array[i].array->name)) - return i; - - return -1; -} - -/* Internal data structure for the index and AST expression transformation - * callbacks for pet_stmt_build_ast_exprs. - * - * "kernel" is the kernel for which are computing AST expressions and - * may be NULL if we are not inside a kernel. - * "accesses" is the list of gpu_stmt_access in the statement. - * "iterator_map" expresses the statement iterators in terms of - * the AST loop iterators. 
- * "sched2copy" expresses the outer copy_schedule_dim dimensions of - * the kernel schedule in terms of the AST loop iterators and - * may be NULL if we are not inside a kernel. - * - * The following fields are set in transform_index and used in transform_expr. - * "array" is the array that is being accessed. - * "global" is set if the global array is accessed (rather than - * shared/private memory). - * "local_array" refers to information on the array specialized - * to the current kernel. - */ -struct ppcg_transform_data { - struct ppcg_options *options; - struct ppcg_kernel *kernel; - struct gpu_stmt_access *accesses; - isl_pw_multi_aff *iterator_map; - isl_pw_multi_aff *sched2copy; - - struct gpu_array_info *array; - int global; - struct gpu_local_array_info *local_array; -}; - -/* Return a pointer to the gpu_array_ref_group in "local" - * that contains the reference "access". - * Return NULL if no such group can be found. - */ -static struct gpu_array_ref_group *find_ref_group( - struct gpu_local_array_info *local, struct gpu_stmt_access *access) -{ - int i, j; - - for (i = 0; i < local->n_group; ++i) { - struct gpu_array_ref_group *group = local->groups[i]; - - for (j = 0; j < group->n_ref; ++j) - if (group->refs[j] == access) - return group; - } - - return NULL; -} - -/* Given an index expression "index" of the form - * - * L -> F(A), - * - * with F(A) either A or some subfield of A and L the AST loop iterators, - * and a tiling "tiling" of the form - * - * [L -> A] -> T - * - * apply the tiling to the outer array in the index expression to obtain - * - * L -> T(A) - * - * If F(A) is some subfield of A, then separate the member access - * into the base index expression and the field index expression, - * apply the tiling to the base index expression and combine the result - * with the field index expression. 
- * - * If F(A) is A, then modify index to keep track of the iterators - * - * L -> [L -> A] - * - * and combine the result with the tiling to obtain a tiled index expression - * in terms of the AST loop iterators - * - * L -> T - */ -static __isl_give isl_multi_pw_aff *tile_outer( - __isl_take isl_multi_pw_aff *index, __isl_take isl_multi_pw_aff *tiling) -{ - isl_bool is_wrapping; - isl_space *space; - isl_multi_pw_aff *mpa; - - is_wrapping = isl_multi_pw_aff_range_is_wrapping(index); - if (is_wrapping < 0) - goto error; - if (is_wrapping) { - isl_multi_pw_aff *field; - - field = isl_multi_pw_aff_copy(index); - field = isl_multi_pw_aff_range_factor_range(field); - index = isl_multi_pw_aff_range_factor_domain(index); - index = tile_outer(index, tiling); - return isl_multi_pw_aff_range_product(index, field); - } - - space = isl_space_domain(isl_multi_pw_aff_get_space(index)); - space = isl_space_map_from_set(space); - mpa = isl_multi_pw_aff_identity(space); - index = isl_multi_pw_aff_range_product(mpa, index); - index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index); - - return index; -error: - isl_multi_pw_aff_free(index); - isl_multi_pw_aff_free(tiling); - return NULL; -} - -/* Index transformation callback for pet_stmt_build_ast_exprs. - * - * "index" expresses the array indices in terms of statement iterators - * - * We first reformulate "index" in terms of the AST loop iterators. - * Then we check if we are accessing the global array or - * a shared/private copy. In particular, if we are not inside a kernel - * then we must be accessing a global array. - * In the former case, we simply return - * the updated index. If "index" is an affine expression rather - * than an array access, then we also return the updated index here. - * - * If no reference groups have been computed for the array, - * then we can only be accessing the global array. - * - * Otherwise, we apply the tiling to the index. 
- * This tiling is of the form - * - * [D -> A] -> T - * - * where D corresponds to the outer tile->depth dimensions of - * the kernel schedule. - * The index is of the form - * - * L -> A - * - * We update the tiling to refer to the AST loop iterators - * - * [L -> A] -> T - * - * and combine it with the index to obtain a tiled index expression in terms - * of the AST loop iterators - * - * L -> T - * - * Note that while the tiling applies directly to an outer array. - * the index may refer to some subfield of this outer array. - * In such cases, the result will refer to the same subfield of the tile. - * That is, an index expression of the form L -> F(A) will be transformed - * into an index expression of the form L -> F(T). - */ -static __isl_give isl_multi_pw_aff *transform_index( - __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id, - void *user) -{ - struct ppcg_transform_data *data = user; - struct gpu_stmt_access *access; - struct gpu_array_ref_group *group; - struct gpu_array_tile *tile; - isl_pw_multi_aff *iterator_map; - int i; - int dim; - const char *name; - isl_space *space; - isl_multi_pw_aff *tiling; - isl_pw_multi_aff *pma; - isl_pw_multi_aff *sched2depth; - - data->array = NULL; - - iterator_map = isl_pw_multi_aff_copy(data->iterator_map); - index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map); - - if (!data->kernel) - return index; - - access = find_access(data->accesses, ref_id); - if (!access) - return index; - if (!isl_map_has_tuple_name(access->access, isl_dim_out)) - return index; - - name = get_outer_array_name(access->access); - i = find_array_index(data->kernel, name); - if (i < 0) - isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal, - "cannot find array", - return isl_multi_pw_aff_free(index)); - data->local_array = &data->kernel->array[i]; - data->array = data->local_array->array; - - group = find_ref_group(data->local_array, access); - if (!group) { - data->global = 1; - return index; - } - - tile = 
gpu_array_ref_group_tile(group); - data->global = !tile; - if (!tile) - return index; - - space = isl_space_domain(isl_multi_aff_get_space(tile->tiling)); - space = isl_space_range(isl_space_unwrap(space)); - space = isl_space_map_from_set(space); - pma = isl_pw_multi_aff_identity(space); - sched2depth = isl_pw_multi_aff_copy(data->sched2copy); - dim = isl_pw_multi_aff_dim(sched2depth, isl_dim_out); - sched2depth = isl_pw_multi_aff_drop_dims(sched2depth, isl_dim_out, - tile->depth, dim - tile->depth); - pma = isl_pw_multi_aff_product(sched2depth, pma); - tiling = isl_multi_pw_aff_from_multi_aff( - isl_multi_aff_copy(tile->tiling)); - tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma); - - index = tile_outer(index, tiling); - - return index; -} - -/* Dereference "expr" by adding an index [0]. - * The original "expr" is assumed not to have any indices. - * - * If "expr" is a member access, then the dereferencing needs - * to be applied to the structure argument of this member access. - */ -static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr) -{ - isl_ctx *ctx; - isl_ast_expr *arg0, *res; - isl_ast_expr_list *list; - - arg0 = isl_ast_expr_get_op_arg(expr, 0); - if (!arg0) - return isl_ast_expr_free(expr); - if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op && - isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) { - isl_ast_expr *arg; - - arg = isl_ast_expr_get_op_arg(arg0, 0); - arg = dereference(arg); - arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg); - expr = isl_ast_expr_set_op_arg(expr, 0, arg0); - - return expr; - } - isl_ast_expr_free(arg0); - - ctx = isl_ast_expr_get_ctx(expr); - res = isl_ast_expr_from_val(isl_val_zero(ctx)); - list = isl_ast_expr_list_from_ast_expr(res); - res = isl_ast_expr_get_op_arg(expr, 0); - res = isl_ast_expr_access(res, list); - isl_ast_expr_free(expr); - - return res; -} - -/* Linearize the index expression "expr" based on the array bounds - * of "array". 
- * - * That is, transform expression - * - * A[i_0][i_1]...[i_n] - * - * to - * - * A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n] - * - * where b_0, b_1, ..., b_n are the bounds on the array. - * - * If the base of "expr" is a member access, then the linearization needs - * to be applied to the structure argument of this member access. - * - * In the base case, if "expr" has no arguments (other than the name of - * the array), then we are passing an entire array to a function. - * In this case, there is nothing to linearize. - * Note that at this point an expression with no arguments can - * only be an entire array because the scalar case and - * the case of single struct are handled by the caller. - * - * If the number of specified index expressions in "expr" - * is smaller than the dimension of the accessed array, - * then the missing i_j also do not appear in the linearized expression. - * Furthermore, since such an expression does not refer to a single - * element while the default linearized expression would refer to - * a single element, we return the expression - * - * A + (..((i_0 * b_1 + i_1) ... ) * b_l + i_l) - * - * instead. Note that because of the special case handling above, - * we can assume here that there is at least one index expression. 
- */ -__isl_give isl_ast_expr *gpu_local_array_info_linearize_index( - struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr) -{ - int i, n; - isl_ast_expr *arg0; - isl_ast_expr *res; - isl_ast_expr_list *list; - - arg0 = isl_ast_expr_get_op_arg(expr, 0); - if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op && - isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) { - isl_ast_expr *arg; - - arg = isl_ast_expr_get_op_arg(arg0, 0); - arg = gpu_local_array_info_linearize_index(array, arg); - arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg); - expr = isl_ast_expr_set_op_arg(expr, 0, arg0); - - return expr; - } - isl_ast_expr_free(arg0); - - if (isl_ast_expr_get_op_n_arg(expr) == 1) - return expr; - - n = isl_ast_expr_get_op_n_arg(expr); - res = isl_ast_expr_get_op_arg(expr, 1); - for (i = 1; i < array->n_index; ++i) { - isl_ast_expr *expr_i; - - expr_i = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i); - res = isl_ast_expr_mul(res, expr_i); - - if (i + 1 >= n) - continue; - expr_i = isl_ast_expr_get_op_arg(expr, i + 1); - res = isl_ast_expr_add(res, expr_i); - } - - if (1 + array->n_index > n) { - res = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), res); - } else { - list = isl_ast_expr_list_from_ast_expr(res); - res = isl_ast_expr_get_op_arg(expr, 0); - res = isl_ast_expr_access(res, list); - } - - isl_ast_expr_free(expr); - - return res; -} - -/* AST expression transformation callback for pet_stmt_build_ast_exprs. - * - * If the AST expression refers to an array that is not accessed - * at all, then this means the value of the expression is not used, - * so we might as well print zero (NULL pointer) instead. - * - * If the AST expression refers to a global scalar that is not - * a read-only scalar, then its address was passed to the kernel and - * we need to dereference it. - * - * If the AST expression refers to an access to a global array, - * then we linearize the access exploiting the bounds in data->local_array. 
- */ -static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr, - __isl_keep isl_id *id, void *user) -{ - struct ppcg_transform_data *data = user; - - if (!data->array) - return expr; - if (!data->array->accessed) { - isl_ctx *ctx; - - ctx = isl_ast_expr_get_ctx(expr); - isl_ast_expr_free(expr); - return isl_ast_expr_from_val(isl_val_zero(ctx)); - } - if (gpu_array_is_read_only_scalar(data->array)) - return expr; - if (!data->global) - return expr; - if (data->array->n_index == 0) - return dereference(expr); - if (!data->array->linearize) - return expr; - - return gpu_local_array_info_linearize_index(data->local_array, expr); -} - -/* This function is called for each instance of a user statement - * in the kernel "kernel", identified by "gpu_stmt". - * "kernel" may be NULL if we are not inside a kernel. - * - * We attach a struct ppcg_kernel_stmt to the "node", containing - * a computed AST expression for each access, through an annotation - * with name "user". - * These AST expressions are computed from iterator_map, - * which expresses the domain - * elements in terms of the generated loops, and sched2copy, - * which expresses the outer copy_schedule_dim dimensions of - * the kernel schedule computed by PPCG in terms of the generated loops. 
- */ -static __isl_give isl_ast_node *create_domain_leaf( - struct ppcg_kernel *kernel, __isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build, struct gpu_stmt *gpu_stmt, - struct gpu_gen *gen) -{ - struct ppcg_transform_data data; - struct ppcg_kernel_stmt *stmt; - isl_ctx *ctx; - isl_id *id; - isl_pw_multi_aff *sched2copy; - isl_map *map; - isl_pw_multi_aff *iterator_map; - isl_union_map *schedule; - - if (!node) - return NULL; - ctx = isl_ast_node_get_ctx(node); - - stmt = isl_calloc_type(ctx, struct ppcg_kernel_stmt); - if (!stmt) - return isl_ast_node_free(node); - - schedule = isl_ast_build_get_schedule(build); - map = isl_map_reverse(isl_map_from_union_map(schedule)); - iterator_map = isl_pw_multi_aff_from_map(map); - if (kernel) - sched2copy = compute_sched_to_copy(kernel, - isl_pw_multi_aff_copy(iterator_map)); - else - sched2copy = NULL; - - stmt->type = ppcg_kernel_domain; - stmt->u.d.stmt = gpu_stmt; - - data.kernel = kernel; - data.accesses = stmt->u.d.stmt->accesses; - data.iterator_map = iterator_map; - data.sched2copy = sched2copy; - stmt->u.d.ref2expr = gen->build_ast_expr(stmt->u.d.stmt->stmt, - build, &transform_index, &data, - &transform_expr, &data); - - isl_pw_multi_aff_free(iterator_map); - isl_pw_multi_aff_free(sched2copy); - - id = isl_id_alloc(ctx, "user", stmt); - id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); - return isl_ast_node_set_annotation(node, id); -} - -/* This function is called for each statement node in the AST - * for copying to or from shared/private memory. - * Attach a pointer to a ppcg_kernel_stmt representing the copy - * statement to the node. - * The statement name is "read" or "write", depending on whether we are - * reading from global memory or writing to global memory. - * - * The schedule is of the form - * - * type[D -> A] -> L - * - * where D corresponds to the outer tile->depth dimensions of - * the kernel schedule, A to the global array and L to the outer - * generated AST schedule. 
- * We compute the inverse and strip off the type, resulting in - * - * L -> [D -> A] - * - * We combine this mapping with on the one hand the projection - * - * [D -> A] -> A - * - * and on the other hand the group tiling - * - * [D -> A] -> T - * - * resulting in - * - * L -> A and L -> T - * - * and store the corresponding expressions in stmt->index and stmt->local_index, - * where stmt points to the ppcg_kernel_stmt that is attached to the node. - * stmt->index is linearized if the global memory array is linearized. - */ -static __isl_give isl_ast_node *create_access_leaf(struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group, __isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build) -{ - struct ppcg_kernel_stmt *stmt; - struct gpu_array_tile *tile; - isl_id *id; - isl_ast_expr *expr; - isl_space *space; - isl_map *access; - isl_pw_multi_aff *pma, *pma2; - const char *type; - - stmt = isl_calloc_type(kernel->ctx, struct ppcg_kernel_stmt); - if (!stmt) - return isl_ast_node_free(node); - - access = isl_map_from_union_map(isl_ast_build_get_schedule(build)); - type = isl_map_get_tuple_name(access, isl_dim_in); - stmt->u.c.read = !strcmp(type, "read"); - access = isl_map_reverse(access); - pma = isl_pw_multi_aff_from_map(access); - pma = isl_pw_multi_aff_reset_tuple_id(pma, isl_dim_out); - - space = isl_space_range(isl_pw_multi_aff_get_space(pma)); - space = isl_space_unwrap(space); - pma2 = isl_pw_multi_aff_range_map(space); - pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2, - isl_pw_multi_aff_copy(pma)); - expr = isl_ast_build_access_from_pw_multi_aff(build, pma2); - if (group->array->linearize) - expr = gpu_local_array_info_linearize_index(group->local_array, - expr); - stmt->u.c.index = expr; - - tile = gpu_array_ref_group_tile(group); - pma2 = isl_pw_multi_aff_from_multi_aff( - isl_multi_aff_copy(tile->tiling)); - pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2, pma); - expr = isl_ast_build_access_from_pw_multi_aff(build, pma2); - 
stmt->u.c.local_index = expr; - - stmt->u.c.array = group->array; - stmt->u.c.local_array = group->local_array; - stmt->type = ppcg_kernel_copy; - - id = isl_id_alloc(kernel->ctx, "copy", stmt); - id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); - return isl_ast_node_set_annotation(node, id); -} - -/* Create a synchronization ppcg_kernel_stmt and - * attach it to the node "node" representing the synchronization. - */ -static __isl_give isl_ast_node *create_sync_leaf( - struct ppcg_kernel *kernel, __isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build) -{ - struct ppcg_kernel_stmt *stmt; - isl_id *id; - - stmt = isl_calloc_type(kernel->ctx, struct ppcg_kernel_stmt); - if (!stmt) - return isl_ast_node_free(node); - - stmt->type = ppcg_kernel_sync; - id = isl_id_alloc(kernel->ctx, "sync", stmt); - id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); - return isl_ast_node_set_annotation(node, id); -} - -/* Build AST expressions for the device array sizes of all arrays in "prog" - * that require allocation on the device using "build", as well as - * for the original array sizes of all arrays that need to be declared - * on the host. - * "node" is freed in case of error. 
- */ -static __isl_give isl_ast_node *build_array_bounds( - __isl_take isl_ast_node *node, struct gpu_prog *prog, - __isl_keep isl_ast_build *build) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_multi_pw_aff *size; - isl_ast_expr *expr; - - if (!gpu_array_requires_device_allocation(array)) - continue; - - size = isl_multi_pw_aff_copy(array->bound); - expr = ppcg_build_size_expr(size, build); - array->bound_expr = expr; - if (!expr) - return isl_ast_node_free(node); - } - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_set *extent; - isl_multi_pw_aff *size; - isl_ast_expr *expr; - - if (!array->declare_local) - continue; - extent = isl_set_copy(array->declared_extent); - size = ppcg_size_from_extent(extent); - expr = ppcg_build_size_expr(size, build); - array->declared_size = expr; - if (!expr) - return isl_ast_node_free(node); - } - - return node; -} - -/* Internal data structure for at_domain. - * - * "prog" represents the entire scop. - * "kernel" points to the kernel to which the current schedule node - * belongs. It is set by before_mark and reset by after_mark. - * It may be NULL if we are outside any kernel. - */ -struct ppcg_at_domain_data { - struct gpu_prog *prog; - struct gpu_gen *gen; - struct ppcg_kernel *kernel; -}; - -/* This function is called for each instance of a user statement - * in the kernel. This may be one of the original user statements - * or a statement introduced by PPCG. - * - * We first check if the statement id corresponds to a gpu statement, - * which indicates the statement is an original user statement. Any statement - * that is not an original user statement has been introduced by PPCG and - * requires special handling. - * - * If the user statement is one of the original user statements, then we call - * create_domain_leaf. If it is "init_device", then we call - * build_array_bounds. 
Otherwise, we check if it is a copy or synchronization - * statement and call the appropriate functions. Statements that copy an array - * to/from the device do not need any further treatment. - * Neither does "clear_device". - */ -static __isl_give isl_ast_node *at_domain(__isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build, void *user) -{ - struct ppcg_at_domain_data *data = user; - struct gpu_stmt *gpu_stmt; - isl_ast_expr *expr, *arg; - isl_id *id; - int is_sync; - const char *name; - void *p; - - expr = isl_ast_node_user_get_expr(node); - arg = isl_ast_expr_get_op_arg(expr, 0); - id = isl_ast_expr_get_id(arg); - name = isl_id_get_name(id); - p = isl_id_get_user(id); - isl_ast_expr_free(expr); - isl_ast_expr_free(arg); - - gpu_stmt = find_stmt(data->prog, id); - is_sync = gpu_tree_id_is_sync(id, data->kernel); - isl_id_free(id); - - if (gpu_stmt) - return create_domain_leaf(data->kernel, node, build, gpu_stmt, - data->gen); - - if (!prefixcmp(name, "to_device_") || !prefixcmp(name, "from_device_")) - return node; - if (!strcmp(name, "init_device")) - return build_array_bounds(node, data->prog, build); - if (!strcmp(name, "clear_device")) - return node; - if (is_sync < 0) - return isl_ast_node_free(node); - if (!strcmp(name, "read") || !strcmp(name, "write")) { - struct gpu_array_ref_group *group = p; - return create_access_leaf(data->kernel, group, node, build); - } - if (!is_sync) - isl_die(data->prog->ctx, isl_error_internal, - "unknown statement type", - return isl_ast_node_free(node)); - return create_sync_leaf(data->kernel, node, build); -} - -/* Given a set of wrapped references "ref", return the corresponding - * access relations based on the tagged access relations "tagged". - * - * The elements of "ref" are of the form - * - * [D -> R] - * - * with D an iteration domains and R a reference. - * The elements of "tagged" are of the form - * - * [D -> R] -> A - * - * with A an array. 
- * - * Extend "tagged" to include the iteration domain in the range, i.e., - * - * [D -> R] -> [D -> A] - * - * apply the result to "ref" and then unwrap the resulting set - * to obtain relations of the form - * - * D -> A - */ -static __isl_give isl_union_map *wrapped_reference_to_access( - __isl_take isl_union_set *ref, __isl_take isl_union_map *tagged) -{ - isl_union_map *tag2access; - - tag2access = isl_union_map_copy(tagged); - tag2access = isl_union_map_universe(tag2access); - tag2access = isl_union_set_unwrap(isl_union_map_domain(tag2access)); - tag2access = isl_union_map_domain_map(tag2access); - tag2access = isl_union_map_range_product(tag2access, tagged); - - ref = isl_union_set_coalesce(ref); - ref = isl_union_set_apply(ref, tag2access); - - return isl_union_set_unwrap(ref); -} - -/* Given an access relation "access" from one or more array reference groups, - * remove those reads if ("read" is 1) or writes (if "read" is 0) - * that are only needed to communicate data within - * the same iteration of "sched". - * The domain of "sched" corresponds to the original statement instances, - * i.e., those that appear in the domains of the access relations. - * "tagged" contains all tagged access relations to all - * the array reference groups accessed by "access" from statement - * instances scheduled by "sched". - * - * If the access is a read then it is either an element of - * - * live_in union (range flow) - * - * where live_in and flow may be overapproximations, or - * it reads an uninitialized value (that is not live-in because - * there is an intermediate kill) or it reads a value that was - * written within the same (compound) statement instance. - * If the access is a write then it is either an element of - * - * live_out union (domain flow) - * - * or it writes a value that is never read (and is not live-out - * because of an intermediate kill) or only - * within the same (compound) statement instance. 
- * In both cases, the access relation is also a subset of - * the group access relation. - * - * The cases where an uninitialized value is read or a value is written - * that is never read or where the dataflow occurs within a statement - * instance are also considered local and may also be removed. - * - * Essentially, we compute the intersection of "access" with either - * - * live_in union (range non-local-flow) - * - * or - * - * live_out union (domain non-local-flow) - * - * We first construct a relation "local" - * - * [[D -> R] -> [D' -> R']] - * - * of pairs of domain iterations accessing the reference group - * and references in the group that are coscheduled by "sched". - * - * If this relation does not intersect the dataflow dependences, - * then there is nothing we can possibly remove, unless the dataflow - * dependences themselves only relate a subset of the accesses. - * In particular, the accesses may not be involved in any dataflow - * dependences, either because they are uninitialized reads/dead writes - * or because the dataflow occurs inside a statement instance. - * - * Since the computation below may break up the access relation - * into smaller pieces, we only perform the intersection with - * the non-local dependent accesses if the local pairs - * intersect the dataflow dependences. Otherwise, we intersect - * with the universe of the non-local dependent accesses. - * This should at least remove accesses from statements that - * do not participate in any dependences. - * - * In particular, we remove the "local" dataflow dependences from - * the set of all dataflow dependences, or at least those - * that may contribute to a domain/range that intersects - * the domain of "access". - * Note that if the potential dataflow dependences are an overapproximation - * of the actual dataflow dependences, then the result remains an - * overapproximation of the non-local dataflow dependences. 
- * Copying to/from global memory is only needed for the references - * in the domain/range of the result or for accesses that are live out/in - * for the entire scop. - * - * We therefore map the domain/range of the "external" relation - * to the corresponding access relation and take the union with - * the live out/in relation. - */ -static __isl_give isl_union_map *remove_local_accesses( - struct gpu_prog *prog, __isl_take isl_union_map *tagged, - __isl_take isl_union_map *access, __isl_take isl_union_map *sched, - int read) -{ - int empty; - isl_union_pw_multi_aff *tagger; - isl_union_set *domain, *access_domain; - isl_union_map *local, *external, *universe; - isl_union_set *tag_set; - - if (isl_union_map_is_empty(access)) { - isl_union_map_free(sched); - isl_union_map_free(tagged); - return access; - } - - tagger = isl_union_pw_multi_aff_copy(prog->scop->tagger); - domain = isl_union_map_domain(isl_union_map_copy(tagged)); - tagger = isl_union_pw_multi_aff_intersect_domain(tagger, - isl_union_set_copy(domain)); - sched = isl_union_map_preimage_domain_union_pw_multi_aff(sched, tagger); - - local = isl_union_map_apply_range(sched, - isl_union_map_reverse(isl_union_map_copy(sched))); - local = isl_union_map_intersect(local, - isl_union_map_copy(prog->scop->tagged_dep_flow)); - - empty = isl_union_map_is_empty(local); - - external = isl_union_map_copy(prog->scop->tagged_dep_flow); - universe = isl_union_map_universe(isl_union_map_copy(access)); - access_domain = isl_union_map_domain(universe); - domain = isl_union_set_universe(domain); - universe = isl_union_set_unwrap(domain); - universe = isl_union_map_intersect_domain(universe, access_domain); - domain = isl_union_map_wrap(universe); - if (read) - external = isl_union_map_intersect_range(external, domain); - else - external = isl_union_map_intersect_domain(external, domain); - external = isl_union_map_intersect_params(external, - isl_set_copy(prog->scop->context)); - external = isl_union_map_subtract(external, 
local); - - if (read) { - tag_set = isl_union_map_range(external); - external = wrapped_reference_to_access(tag_set, tagged); - external = isl_union_map_union(external, - isl_union_map_copy(prog->scop->live_in)); - } else { - tag_set = isl_union_map_domain(external); - external = wrapped_reference_to_access(tag_set, tagged); - external = isl_union_map_union(external, - isl_union_map_copy(prog->scop->live_out)); - } - - if (empty < 0) - external = isl_union_map_free(external); - else if (empty) - external = isl_union_map_universe(external); - - access = isl_union_map_intersect(access, external); - - return access; -} - -/* Given an access relation "access" from "group", remove those reads - * if ("read" is 1) or writes (if "read" is 0) that are only needed to - * communicate data within the same iteration of the schedule "prefix" - * at the position where the copying of the group is inserted. - * That is, the output dimension of "prefix" - * is equal to tile->depth. - * The domain of "prefix" corresponds to the original statement instances, - * i.e., those that appear in the domains of the access relations. - * - * Extract the tagged access relation of "group" and - * then call remove_local_accesses. - */ -static __isl_give isl_union_map *remove_local_accesses_group( - struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_union_map *access, __isl_keep isl_union_map *prefix, - int read) -{ - isl_union_map *sched, *tagged; - - if (isl_union_map_is_empty(access)) - return access; - - tagged = group_tagged_access_relation(group); - sched = isl_union_map_copy(prefix); - - return remove_local_accesses(kernel->prog, tagged, access, sched, read); -} - -/* Build an access AST expression for the effective grid size using "build". - * Store the result in kernel->grid_size_expr. 
- */ -static isl_stat build_grid_size(struct ppcg_kernel *kernel, - __isl_keep isl_ast_build *build) -{ - isl_multi_pw_aff *size; - - size = isl_multi_pw_aff_copy(kernel->grid_size); - size = isl_multi_pw_aff_set_tuple_name(size, isl_dim_out, "grid"); - kernel->grid_size_expr = ppcg_build_size_expr(size, build); - - if (!kernel->grid_size_expr) - return isl_stat_error; - return isl_stat_ok; -} - -/* Build access AST expressions for the localized array sizes using "build". - * Store the result in local->bound_expr. - * Only do this for arrays for which localized bounds have been computed. - */ -static isl_stat build_local_array_sizes(struct ppcg_kernel *kernel, - __isl_keep isl_ast_build *build) -{ - int i; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *local = &kernel->array[i]; - isl_multi_pw_aff *size; - - if (local->n_group == 0) - continue; - size = isl_multi_pw_aff_copy(local->bound); - local->bound_expr = ppcg_build_size_expr(size, build); - if (!local->bound_expr) - return isl_stat_error; - } - - return isl_stat_ok; -} - -/* Build access AST expressions for the effective grid size and - * the localized array sizes using "build". - */ -static isl_stat build_grid_and_local_array_sizes(struct ppcg_kernel *kernel, - __isl_keep isl_ast_build *build) -{ - if (build_grid_size(kernel, build) < 0) - return isl_stat_error; - if (build_local_array_sizes(kernel, build) < 0) - return isl_stat_error; - return isl_stat_ok; -} - -/* This function is called before the AST generator starts traversing - * the schedule subtree of a node with mark "mark". - * - * If the mark is called "kernel", store the kernel pointer in data->kernel - * for use in at_domain and build AST expressions for the grid size and - * the localized array sizes. 
- */ -static isl_stat before_mark(__isl_keep isl_id *mark, - __isl_keep isl_ast_build *build, void *user) -{ - struct ppcg_at_domain_data *data = user; - - if (!mark) - return isl_stat_error; - if (!strcmp(isl_id_get_name(mark), "kernel")) { - data->kernel = isl_id_get_user(mark); - if (build_grid_and_local_array_sizes(data->kernel, build) < 0) - return isl_stat_error; - } - return isl_stat_ok; -} - -/* This function is called after the AST generator has finished traversing - * the schedule subtree of a mark node. "node" points to the corresponding - * mark AST node. - * - * If the mark is called "kernel", then replace "node" by a user node - * that "calls" the kernel, representing the launch of the kernel. - * The original "node" is stored inside the kernel object so that - * it can be used to print the device code. - * Note that this assumes that a kernel is only launched once. - * Also clear data->kernel. - */ -static __isl_give isl_ast_node *after_mark(__isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build, void *user) -{ - isl_ctx *ctx; - isl_id *id; - isl_ast_expr *expr; - isl_ast_expr_list *list; - struct ppcg_kernel *kernel; - struct ppcg_at_domain_data *data = user; - - ctx = isl_ast_node_get_ctx(node); - id = isl_ast_node_mark_get_id(node); - if (!id) - return isl_ast_node_free(node); - if (strcmp(isl_id_get_name(id), "kernel") || !data->kernel) { - isl_id_free(id); - return node; - } - kernel = data->kernel; - data->kernel = NULL; - kernel->space = isl_ast_build_get_schedule_space(build); - kernel->tree = isl_ast_node_mark_get_node(node); - isl_ast_node_free(node); - - expr = isl_ast_expr_from_id(isl_id_copy(id)); - list = isl_ast_expr_list_alloc(ctx, 0); - expr = isl_ast_expr_call(expr, list); - node = isl_ast_node_alloc_user(expr); - node = isl_ast_node_set_annotation(node, id); - - return node; -} - -static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user) -{ - int *depth = user; - int node_depth; - - if 
(isl_schedule_node_get_type(node) != isl_schedule_node_leaf) - return isl_bool_true; - node_depth = isl_schedule_node_get_schedule_depth(node); - if (node_depth > *depth) - *depth = node_depth; - - return isl_bool_false; -} - -/* Use isl to generate code for both the host and the device - * from "schedule". - * The device code is marked by "kernel" mark nodes in the schedule tree, - * containing a pointer to a ppcg_kernel object. - * The returned AST only contains the AST for the host code. - * The ASTs for the device code are embedded in ppcg_kernel objects - * attached to the leaf nodes that call "kernel". - */ -__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, - __isl_take isl_schedule *schedule) -{ - struct ppcg_at_domain_data data; - isl_ast_build *build; - isl_ast_node *tree; - isl_id_list *iterators; - int depth; - - data.prog = gen->prog; - data.gen = gen; - data.kernel = NULL; - - depth = 0; - if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth, - &depth) < 0) - return NULL; - build = isl_ast_build_alloc(gen->prog->ctx); - iterators = ppcg_scop_generate_names(gen->prog->scop, depth, "c"); - build = isl_ast_build_set_iterators(build, iterators); - build = isl_ast_build_set_at_each_domain(build, &at_domain, &data); - build = isl_ast_build_set_before_each_mark(build, &before_mark, &data); - build = isl_ast_build_set_after_each_mark(build, &after_mark, &data); - if (gen->prog->scop->options->debug->dump_final_schedule) - isl_schedule_dump(schedule); - tree = isl_ast_build_node_from_schedule(build, schedule); - isl_ast_build_free(build); - - return tree; -} - -__isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str) -{ - if (!str) - return NULL; - return isl_union_map_read_from_str(ctx, str); -} - -/* Can "node" be tiled and then mapped to block and thread identifiers? - * That is, is it permutable with at least one coincident dimension? 
- */ -static int is_permutable(__isl_keep isl_schedule_node *node) -{ - if (!node) - return -1; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_band) - return 0; - if (!isl_schedule_node_band_get_permutable(node)) - return 0; - if (isl_schedule_node_band_n_member(node) < 1) - return 0; - if (!isl_schedule_node_band_member_get_coincident(node, 0)) - return 0; - - return 1; -} - -/* A isl_schedule_foreach_schedule_node_top_down callback - * for setting *any_permutable and aborting the search - * if "node" is a permutable band with coincident dimensions. - * Otherwise, continue searching. - */ -static isl_bool set_permutable(__isl_keep isl_schedule_node *node, void *user) -{ - int *any_permutable = user; - int permutable; - - permutable = is_permutable(node); - if (permutable < 0) - return isl_bool_error; - if (!permutable) - return isl_bool_true; - - *any_permutable = 1; - - return isl_bool_error; -} - -/* Does the subtree rooted at "node" have any suitably permutable band nodes? - * That is, does it have any nodes that are permutable and that - * have a least one coincident dimension? - */ -static int subtree_has_permutable_bands(__isl_keep isl_schedule_node *node) -{ - int any_parallelism = 0; - - if (isl_schedule_node_foreach_descendant_top_down(node, &set_permutable, - &any_parallelism) < 0 && - !any_parallelism) - return -1; - - return any_parallelism; -} - -/* Does "schedule" contain any permutable band with at least one coincident - * member? - */ -int has_any_permutable_node(__isl_keep isl_schedule *schedule) -{ - isl_schedule_node *root; - int any_permutable; - - root = isl_schedule_get_root(schedule); - any_permutable = subtree_has_permutable_bands(root); - isl_schedule_node_free(root); - - return any_permutable; -} - -/* Is "node" a candidate for mapping to block and thread identifiers? - * In particular, is it permutable with at least one coincident dimension? 
- * Alternatively, does the subtree rooted at "node" not contain - * any such permutable node? Filter nodes are skipped in this case, - * because a band node will be inserted in front of the returned - * node and this is not possible for filter nodes that are children - * of set or sequence nodes. - */ -static int is_candidate(__isl_keep isl_schedule_node *node) -{ - int permutable; - - if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf) - return 1; - permutable = is_permutable(node); - if (permutable < 0 || permutable) - return permutable; - if (isl_schedule_node_get_type(node) == isl_schedule_node_filter) - return 0; - permutable = subtree_has_permutable_bands(node); - if (permutable < 0) - return -1; - return !permutable; -} - -/* Is "node" the outermost node in its branch that can be tiled - * and then mapped to block and thread identifiers? - * If there are no such nodes in the subtree at "node" and - * if "node" is not a filter node, then it is accepted too. - */ -static int is_outer_tilable(__isl_keep isl_schedule_node *node) -{ - int tilable; - isl_schedule_node *ancestor; - - tilable = is_candidate(node); - if (tilable < 0) - return -1; - if (!tilable) - return 0; - - tilable = 0; - ancestor = isl_schedule_node_copy(node); - while (isl_schedule_node_has_parent(ancestor)) { - ancestor = isl_schedule_node_parent(ancestor); - - tilable = is_candidate(ancestor); - if (tilable < 0 || tilable) - break; - } - - isl_schedule_node_free(ancestor); - return tilable < 0 ? -1 : !tilable; -} - -/* Collect the references to all writes in "group". - * Each reference is represented by a universe set in a space - * - * [S[i,j] -> R[]] - * - * with S[i,j] the statement instance space and R[] the array reference. 
- */ -static __isl_give isl_union_set *group_tagged_writes( - struct gpu_array_ref_group *group) -{ - int i; - isl_space *space; - isl_union_set *writes; - - space = isl_map_get_space(group->access); - writes = isl_union_set_empty(space); - for (i = 0; i < group->n_ref; ++i) { - isl_space *space; - isl_set *writes_i; - - if (!group->refs[i]->write) - continue; - - space = isl_map_get_space(group->refs[i]->tagged_access); - space = isl_space_domain(space); - writes_i = isl_set_universe(space); - writes = isl_union_set_add_set(writes, writes_i); - } - - return writes; -} - -/* Is there any write access in "group" that requires synchronization - * on a write to global memory? - * We currently take into account all writes that would require - * synchronization at the thread level depth, but if the copying - * for this group is performed at an outer level, then we do not - * actually need to take into account dependences at intermediate levels. - */ -static int any_sync_writes_in_group(struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group) -{ - isl_union_set *writes; - int empty, disjoint; - - empty = isl_union_set_is_empty(kernel->sync_writes); - if (empty < 0) - return -1; - if (empty) - return 0; - - writes = group_tagged_writes(group); - disjoint = isl_union_set_is_disjoint(kernel->sync_writes, writes); - isl_union_set_free(writes); - - return disjoint < 0 ? -1 : !disjoint; -} - -/* Collect the references to all writes in "kernel" that write directly - * to global or shared memory, i.e., that are not mapped to private memory. - * Each reference is represented by a universe set in a space - * - * [S[i,j] -> R[]] - * - * with S[i,j] the statement instance space and R[] the array reference. 
- */ -static __isl_give isl_union_set *collect_non_private_tagged_writes( - struct ppcg_kernel *kernel) -{ - isl_union_set *writes; - int i, j; - - writes = isl_union_set_empty(isl_union_set_get_space(kernel->arrays)); - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - enum ppcg_group_access_type type; - isl_union_set *writes_ij; - - if (!group->write) - continue; - type = gpu_array_ref_group_type(group); - if (type == ppcg_access_private) - continue; - writes_ij = group_tagged_writes(group); - writes = isl_union_set_union(writes, writes_ij); - } - } - - return writes; -} - -/* Are there any direct writes to global memory that require - * synchronization? - */ -static int any_global_or_shared_sync_writes(struct ppcg_kernel *kernel) -{ - isl_union_set *writes; - int empty, disjoint; - - empty = isl_union_set_is_empty(kernel->sync_writes); - if (empty < 0) - return -1; - if (empty) - return 0; - - writes = collect_non_private_tagged_writes(kernel); - disjoint = isl_union_set_is_disjoint(kernel->sync_writes, writes); - isl_union_set_free(writes); - - return disjoint < 0 ? -1 : !disjoint; -} - -/* Construct an isl_multi_val for use as tile sizes for tiling "node" - * from the elements in "tile_size". - */ -static __isl_give isl_multi_val *construct_band_tiles_sizes( - __isl_keep isl_schedule_node *node, int *tile_size) -{ - isl_space *space; - - if (!node) - return NULL; - - space = isl_schedule_node_band_get_space(node); - return ppcg_multi_val_from_int_list(space, tile_size); -} - -/* Replace the partial schedule S of the band node "node" by - * - * floor(S/f) - * - * or - * - * f * floor(S/f) - * - * if scale_tile_loops is set, with f the integers in "factor". - * The list that "factor" points to is assumed to contain at least - * as many elements as the number of members in the band. 
- */ -static __isl_give isl_schedule_node *snap_band_to_sizes( - __isl_take isl_schedule_node *node, int *factor, - struct ppcg_options *options) -{ - isl_multi_val *mv; - - mv = construct_band_tiles_sizes(node, factor); - node = isl_schedule_node_band_scale_down(node, isl_multi_val_copy(mv)); - if (options->scale_tile_loops) - node = isl_schedule_node_band_scale(node, - isl_multi_val_copy(mv)); - isl_multi_val_free(mv); - - return node; -} - -/* Tile "band" with tile size specified by "sizes". - * - * Since the tile loops will be mapped to block ids, we forcibly - * turn off tile loop scaling. We may want to enable tile loop scaling - * at some later point, but then we would have to support the detection - * of strides during the mapping to block ids. - * Similarly, since the point loops will be mapped to thread ids, - * we forcibly shift the point loops so that they start at zero. - */ -static __isl_give isl_schedule_node *tile_band( - __isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes) -{ - isl_ctx *ctx = isl_schedule_node_get_ctx(node); - int scale_tile; - int shift_point; - - scale_tile = isl_options_get_tile_scale_tile_loops(ctx); - isl_options_set_tile_scale_tile_loops(ctx, 0); - shift_point = isl_options_get_tile_shift_point_loops(ctx); - isl_options_set_tile_shift_point_loops(ctx, 1); - - node = isl_schedule_node_band_tile(node, sizes); - - isl_options_set_tile_scale_tile_loops(ctx, scale_tile); - isl_options_set_tile_shift_point_loops(ctx, shift_point); - - return node; -} - -/* Extract the set of parameter values and outer schedule dimensions - * for which any statement instance - * in the kernel inserted at "node" needs to be executed. - * Intersect the set of parameter values derived from the host schedule - * relation with the context of "prog". 
- */ -static __isl_give isl_set *extract_context(__isl_keep isl_schedule_node *node, - struct gpu_prog *prog) -{ - isl_union_map *schedule; - isl_union_set *schedule_domain; - isl_set *context; - int empty; - - schedule = isl_schedule_node_get_prefix_schedule_relation(node); - schedule_domain = isl_union_map_range(schedule); - empty = isl_union_set_is_empty(schedule_domain); - if (empty < 0) { - isl_union_set_free(schedule_domain); - return NULL; - } - if (empty) { - int depth; - isl_space *space; - - space = isl_union_set_get_space(schedule_domain); - isl_union_set_free(schedule_domain); - space = isl_space_set_from_params(space); - depth = isl_schedule_node_get_schedule_depth(node); - space = isl_space_add_dims(space, isl_dim_set, depth); - context = isl_set_empty(space); - } else { - context = isl_set_from_union_set(schedule_domain); - } - context = isl_set_intersect_params(context, - isl_set_copy(prog->context)); - - return context; -} - -/* Return the set of outer array elements accessed by - * by the statement instances in "domain" in "prog". - * The instances in "domain" are those that appear - * in the domains of the access relations in "prog". - */ -static __isl_give isl_union_set *accessed_by_domain( - __isl_take isl_union_set *domain, struct gpu_prog *prog) -{ - isl_union_map *access; - isl_union_set *arrays; - - access = isl_union_map_union(isl_union_map_copy(prog->read), - isl_union_map_copy(prog->may_write)); - access = isl_union_map_intersect_domain(access, domain); - arrays = isl_union_map_range(access); - arrays = isl_union_set_apply(arrays, - isl_union_map_copy(prog->to_outer)); - - return arrays; -} - -/* Return the number of outer band members of the band node "node" - * that are marked coincident. 
- */ -static int n_outer_coincidence(__isl_keep isl_schedule_node *node) -{ - int i, n; - - n = isl_schedule_node_band_n_member(node); - - for (i = 0; i < n; ++i) - if (!isl_schedule_node_band_member_get_coincident(node, i)) - break; - - return i; -} - -/* If the band node "node" has more than "n" members, then split off - * the first "n" of them. - */ -static __isl_give isl_schedule_node *split_band( - __isl_take isl_schedule_node *node, int n) -{ - int dim; - - dim = isl_schedule_node_band_n_member(node); - if (n < dim) - node = isl_schedule_node_band_split(node, n); - - return node; -} - -/* Scale a band node that may have been split by split_band. - * "sizes" are the scaling factors for the original node. - * "node" either points to the original band node, or the outer - * of the two pieces after splitting. - * - * If the number of elements in "node" is smaller than the number of - * elements in "sizes", then some splitting has occurred and we split - * "sizes" in the same way. - */ -static __isl_give isl_schedule_node *scale_band( - __isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes) -{ - int n, dim; - - n = isl_multi_val_dim(sizes, isl_dim_set); - dim = isl_schedule_node_band_n_member(node); - if (n > dim) { - isl_multi_val *sizes2; - - sizes2 = isl_multi_val_copy(sizes); - sizes = isl_multi_val_drop_dims(sizes, - isl_dim_set, dim, n - dim); - sizes2 = isl_multi_val_drop_dims(sizes2, isl_dim_set, 0, dim); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_band_scale(node, sizes2); - node = isl_schedule_node_parent(node); - } - - return isl_schedule_node_band_scale(node, sizes); -} - -/* Return an isl_multi_aff, with as elements the parameters in "space" - * that have the names specified by the elements in "names". - * If (some of) these parameters do not already appear in "space", - * then they are added first. 
- */ -static __isl_give isl_multi_aff *parameter_vector(__isl_take isl_space *space, - __isl_keep isl_id_list *names) -{ - int i, n; - isl_local_space *ls; - isl_multi_aff *ma; - - if (!names) - space = isl_space_free(space); - - n = isl_id_list_n_id(names); - for (i = 0; i < n; ++i) { - int pos; - isl_id *id; - - id = isl_id_list_get_id(names, i); - pos = isl_space_find_dim_by_id(space, isl_dim_param, id); - if (pos >= 0) { - isl_id_free(id); - continue; - } - pos = isl_space_dim(space, isl_dim_param); - space = isl_space_add_dims(space, isl_dim_param, 1); - space = isl_space_set_dim_id(space, isl_dim_param, pos, id); - } - ma = isl_multi_aff_zero(isl_space_copy(space)); - ls = isl_local_space_from_space(isl_space_domain(space)); - for (i = 0; i < n; ++i) { - int pos; - isl_id *id; - isl_aff *aff; - - id = isl_id_list_get_id(names, i); - pos = isl_space_find_dim_by_id(space, isl_dim_param, id); - isl_id_free(id); - aff = isl_aff_var_on_domain(isl_local_space_copy(ls), - isl_dim_param, pos); - ma = isl_multi_aff_set_aff(ma, i, aff); - } - isl_local_space_free(ls); - - return ma; -} - -/* Return constraints on the domain elements that equate a sequence of - * parameters called "names", to the partial schedule - * of "node" modulo the integers in "size". - * The number of elements in the array "size" should be equal - * to the number of elements in "names". - * The number of members of the band node "node" should be smaller - * than or equal to this number. If it is smaller, then the first - * elements of "names" are equated to zero. 
- */ -static __isl_give isl_union_set *set_schedule_modulo( - __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names, - int *size) -{ - int n, n_zero; - isl_space *space; - isl_multi_aff *ma; - isl_multi_union_pw_aff *mupa, *mupa2; - isl_multi_val *mv; - isl_union_set *domain; - - if (!node) - return NULL; - n = isl_id_list_n_id(names); - if (n == 0) - return isl_schedule_node_get_universe_domain(node); - n_zero = n - isl_schedule_node_band_n_member(node); - - mupa = isl_schedule_node_band_get_partial_schedule(node); - mv = construct_band_tiles_sizes(node, size + n_zero); - mupa = isl_multi_union_pw_aff_mod_multi_val(mupa, mv); - - space = isl_multi_union_pw_aff_get_space(mupa); - space = isl_space_params(space); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, n_zero); - ma = isl_multi_aff_zero(space); - - domain = isl_schedule_node_get_universe_domain(node); - mupa2 = isl_multi_union_pw_aff_multi_aff_on_domain( - isl_union_set_copy(domain), ma); - mupa = isl_multi_union_pw_aff_range_product(mupa2, mupa); - - space = isl_multi_union_pw_aff_get_space(mupa); - ma = parameter_vector(space, names); - - mupa2 = isl_multi_union_pw_aff_multi_aff_on_domain(domain, ma); - mupa = isl_multi_union_pw_aff_sub(mupa, mupa2); - - return isl_multi_union_pw_aff_zero_union_set(mupa); -} - -/* Insert a context node at "node" introducing the block and thread - * identifiers along with their bounds, which are stored in kernel->grid_size - * and kernel->block_dim. - * Note that the bounds on the block identifiers may implicitly impose - * constraints on the parameters. A guard needs to be inserted - * in the schedule tree to ensure that those bounds hold at "node". - * This guard is inserted in insert_guard. 
- */ -static __isl_give isl_schedule_node *insert_context(struct ppcg_kernel *kernel, - __isl_take isl_schedule_node *node) -{ - isl_set *context; - - context = isl_set_universe(isl_set_get_space(kernel->context)); - - context = add_bounded_parameters_dynamic(context, - kernel->grid_size, kernel->block_ids); - context = add_bounded_parameters(context, - kernel->block_dim, kernel->thread_ids); - - node = isl_schedule_node_insert_context(node, context); - - return node; -} - -/* Insert a guard that eliminates kernel launches where the kernel - * obviously does not have any work to do. - * - * In particular, eliminate kernel launches where there are obviously - * zero blocks. - * Use the same block size constraints that are used to create the context - * to ensure that all constraints implicit in the constructed context - * are imposed by the guard. - * - * Additionally, add other constraints that are valid - * for each executed instance ("context"), as long as this does not result - * in a disjunction. - */ -static __isl_give isl_schedule_node *insert_guard( - __isl_take isl_schedule_node *node, __isl_keep isl_set *context, - __isl_keep isl_multi_pw_aff *size, struct ppcg_scop *scop) -{ - unsigned nparam, n; - isl_set *guard; - isl_id_list *ids; - - guard = isl_set_copy(context); - guard = isl_set_compute_divs(guard); - guard = isl_set_from_basic_set(isl_set_simple_hull(guard)); - - nparam = isl_set_dim(guard, isl_dim_param); - n = isl_multi_pw_aff_dim(size, isl_dim_out); - ids = ppcg_scop_generate_names(scop, n, "__ppcg_tmp"); - guard = add_bounded_parameters_dynamic(guard, size, ids); - isl_id_list_free(ids); - guard = isl_set_project_out(guard, isl_dim_param, nparam, n); - - node = isl_schedule_node_insert_guard(node, guard); - - return node; -} - -/* Does any array reference group mapping require the band that is mapped - * to threads to be unrolled? 
- */ -static int kernel_requires_unroll(struct ppcg_kernel *kernel) -{ - int i, j; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - if (gpu_array_ref_group_requires_unroll(group)) - return 1; - } - } - - return 0; -} - -/* Mark the given band node "node" for unrolling by the AST generator and - * then sink it to the leaves of the schedule tree. - * All dimensions of "node" are assumed to be coincident, such that this - * sinking is a valid operation. - */ -static __isl_give isl_schedule_node *unroll(__isl_take isl_schedule_node *node) -{ - node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); - - node = isl_schedule_node_band_sink(node); - - return node; -} - -/* Insert a synchronization node in the schedule tree of "node" - * after the core computation of "kernel" at the level of the band - * that is mapped to threads, except if that level is equal to - * that of the band that is mapped to blocks or if there are no writes - * to global or shared memory in the core computation that require - * synchronization. - * If there are any writes to shared memory and the shared memory - * copying is performed at the same level, then synchronization - * is needed between the core and the copying anyway, so we might - * as well add it here. If the copying is performed at a higher - * level, then different iterations of intermediate schedule dimensions - * may have a different mapping from between shared memory elements and - * threads, such that synchronization is required after the core. - * "node" is assumed to point to the kernel node. - * - * If the shared and the thread mark point to the same node, then make - * sure the synchronization is inserted outside of the shared mark. 
- */ -static __isl_give isl_schedule_node *add_sync(struct ppcg_kernel *kernel, - __isl_take isl_schedule_node *node) -{ - int depth; - int need_sync; - - need_sync = any_global_or_shared_sync_writes(kernel); - if (need_sync < 0) - return isl_schedule_node_free(node); - if (!need_sync) - return node; - - node = gpu_tree_move_down_to_thread(node, kernel->core); - depth = isl_schedule_node_get_schedule_depth(node); - node = gpu_tree_move_up_to_kernel(node); - if (depth == isl_schedule_node_get_schedule_depth(node)) - return node; - - node = gpu_tree_move_down_to_depth(node, depth, kernel->core); - node = gpu_tree_ensure_following_sync(node, kernel); - - node = gpu_tree_move_up_to_kernel(node); - - return node; -} - -/* Return a read ("read" is 1) or write access relation for "group" - * with those accesses removed that are only needed to communicate data - * within the subtree of the schedule rooted at "node". - * Furthermore, include the prefix schedule at "node". - * That is, return a relation of the form - * - * S -> [D -> A] - * - * with D the outer schedule dimensions at "node". - */ -static __isl_give isl_union_map *anchored_non_local_accesses( - struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_schedule_node *node, int read) -{ - isl_union_map *access; - isl_union_map *prefix; - - prefix = isl_schedule_node_get_prefix_schedule_relation(node); - prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, - isl_union_pw_multi_aff_copy(kernel->contraction)); - access = gpu_array_ref_group_access_relation(group, read, !read); - access = remove_local_accesses_group(kernel, group, access, prefix, - read); - access = isl_union_map_range_product(prefix, access); - - return access; -} - -/* Given an array reference group "group", create a mapping - * - * read[D -> A] -> [D -> A] - * - * if "read" is set or - * - * write[D -> A] -> [D -> A] - * - * if "read" is not set. 
- * D corresponds to the outer tile->depth dimensions of - * the kernel schedule. - */ -static __isl_give isl_multi_aff *create_from_access(isl_ctx *ctx, - struct gpu_array_ref_group *group, int read) -{ - struct gpu_array_tile *tile; - isl_space *space; - isl_id *id; - - tile = gpu_array_ref_group_tile(group); - space = isl_space_copy(group->array->space); - space = isl_space_from_range(space); - space = isl_space_add_dims(space, isl_dim_in, tile->depth); - space = isl_space_wrap(space); - space = isl_space_map_from_set(space); - - id = isl_id_alloc(ctx, read ? "read" : "write", group); - space = isl_space_set_tuple_id(space, isl_dim_in, id); - - return isl_multi_aff_identity(space); -} - -/* If any writes in "group" require synchronization, then make sure - * that there is a synchronization node for "kernel" after the node - * following "node" in a sequence. - * - * If "shared" is set and no synchronization is needed for - * the writes to global memory, then add synchronization before - * the kernel to protect shared memory from being overwritten - * by the next iteration of the core computation. - * No additional synchronization is needed to protect against - * the next copy into shared memory because each element of - * the shared memory tile is always copied by the same thread. 
- */ -static __isl_give isl_schedule_node *add_group_write_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group, int shared) -{ - int need_sync; - - need_sync = any_sync_writes_in_group(kernel, group); - if (need_sync < 0) - return isl_schedule_node_free(node); - if (need_sync) { - node = isl_schedule_node_parent(node); - node = isl_schedule_node_next_sibling(node); - node = isl_schedule_node_child(node, 0); - node = gpu_tree_ensure_following_sync(node, kernel); - } else if (shared) { - struct gpu_array_tile *tile; - - tile = gpu_array_ref_group_tile(group); - node = isl_schedule_node_parent(node); - node = isl_schedule_node_parent(node); - node = gpu_tree_move_down_to_depth(node, tile->depth, - kernel->core); - node = gpu_tree_move_left_to_sync(node, kernel); - } - - return node; -} - -/* Add copy statements to the schedule tree of "node" - * for reading from global memory to private memory (if "read" is set) or - * for writing back from private memory to global memory - * (if "read" is not set) for the array reference group "group" that - * is mapped to private memory. - * On input, "node" points to the kernel node, and it is moved - * back there on output. - * - * The copies are performed in the order of the array elements. - * The copy statement instances include a reference to the outer - * tile->depth dimensions of the kernel schedule for ease of - * combining them with the group tiling. - * - * That is, the extra schedule is of the form - * - * type[D -> A] -> A - * - * where D corresponds to the outer tile->depth dimensions of - * the kernel schedule and A to the global array. - * This schedule is unrolled because registers are not addressable. - * - * The copying is inserted in the schedule tree through an extension - * of the form - * - * D -> type[D -> A] - * - * where the extra domain elements type[D -> A] are those accessed - * by the group. 
- * A filter is inserted on type[D -> A] to ensure that the element - * is read/written by the same thread that needs the element. - * This filter is obtained by applying - * - * S -> type[D -> A] - * - * to the thread filter for the core statements. - * - * The extension is inserted before the core computation in case of a read - * and after the core computation in case of a write. - * In the latter case, we also make sure that there is a synchronization - * node after the write to global memory, unless this write is performed - * at the outer level of the kernel. - * In principle, this synchronization could be inserted higher - * in the schedule tree depending on where the corresponding reads - * from global memory are performed. - */ -static __isl_give isl_schedule_node *add_copies_group_private( - struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_schedule_node *node, int read) -{ - struct gpu_array_tile *tile; - isl_union_map *access; - isl_union_set *domain; - isl_space *space; - isl_multi_aff *from_access; - isl_multi_pw_aff *mpa; - isl_multi_union_pw_aff *mupa; - isl_union_pw_multi_aff *contraction; - isl_schedule_node *graft; - isl_union_set *filter; - int kernel_depth; - int empty; - - kernel_depth = isl_schedule_node_get_schedule_depth(node); - tile = gpu_array_ref_group_tile(group); - node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core); - - access = anchored_non_local_accesses(kernel, group, node, read); - empty = isl_union_map_is_empty(access); - if (empty < 0 || empty) { - isl_union_map_free(access); - if (empty < 0) - return isl_schedule_node_free(node); - return gpu_tree_move_up_to_kernel(node); - } - - group->array->global = 1; - group->local_array->global = 1; - - from_access = create_from_access(kernel->ctx, group, read); - space = isl_space_domain(isl_multi_aff_get_space(from_access)); - access = isl_union_map_preimage_range_multi_aff(access, from_access); - - filter = 
isl_union_set_copy(kernel->thread_filter); - contraction = isl_union_pw_multi_aff_copy(kernel->contraction); - filter = isl_union_set_preimage_union_pw_multi_aff(filter, contraction); - filter = isl_union_set_apply(filter, isl_union_map_copy(access)); - filter = isl_union_set_detect_equalities(filter); - filter = isl_union_set_coalesce(filter); - - domain = isl_union_map_range(access); - access = isl_union_set_wrapped_domain_map(domain); - access = isl_union_map_reverse(access); - access = isl_union_map_coalesce(access); - graft = isl_schedule_node_from_extension(access); - - space = isl_space_map_from_set(space); - mpa = isl_multi_pw_aff_identity(space); - mpa = isl_multi_pw_aff_range_factor_range(mpa); - mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa); - - graft = isl_schedule_node_child(graft, 0); - graft = isl_schedule_node_insert_partial_schedule(graft, mupa); - graft = unroll(graft); - - graft = isl_schedule_node_insert_filter(graft, filter); - - graft = isl_schedule_node_parent(graft); - - if (read) - node = isl_schedule_node_graft_before(node, graft); - else { - node = isl_schedule_node_graft_after(node, graft); - if (kernel_depth < tile->depth) - node = add_group_write_sync(node, kernel, group, 0); - } - - node = gpu_tree_move_up_to_kernel(node); - - return node; -} - -/* Add copy statements to the schedule tree of "node" - * for reading from global memory to shared memory (if "read" is set) or - * for writing back from shared memory to global memory - * (if "read" is not set) for the array reference group "group" that - * is mapped to shared memory. - * On input, "node" points to the kernel node, and it is moved - * back there on output. - * - * The copies are performed in the order of the corresponding shared - * memory tile. - * The copy statement instances include a reference to the outer - * tile->depth dimensions of the kernel schedule for ease of - * combining them with the group tiling. 
- * - * If we are performing a read from global memory to shared memory and - * if the array involved is not a scalar, then we copy - * the entire tile to shared memory. This may result in some extra - * elements getting copied, but it should lead to simpler code - * (which means that fewer registers may be needed) and less divergence. - * - * Otherwise, we only copy the elements that will be read or have been written - * in the kernel. - * - * That is, the extra schedule is of the form - * - * type[D -> A] -> T - * - * where D corresponds to the outer tile->depth dimensions of - * the kernel schedule, A to the global array and T is the corresponding - * shared memory tile. - * - * The copying is inserted in the schedule tree through an extension - * of the form - * - * D -> type[D -> A] - * - * where the extra domain elements type[D -> A] are those accessed - * by the group. In the case of read from a non-scalar, this set - * is replaced by the entire shared memory tile. - * - * If the "unroll_copy_shared" option is set, then the AST generator - * is instructed to unroll the copying code. - * - * A filter is inserted on type[D -> A] to map the copy instances - * to the threads. In particular, the thread identifiers are - * equated to the position inside the shared memory tile (T) - * modulo the block size. - * We try to align the innermost tile dimension with the innermost - * thread identifier (x) as a heuristic to improve coalescing. - * In particular, if the dimension of the tile is greater than - * the dimension of the block, then the schedule mapping to the tile - * is broken up into two pieces and the filter is applied to the inner part. - * If, on the other hand, the dimension of the tile is smaller than - * the dimension of the block, then the initial thread identifiers - * are equated to zero and the remaining thread identifiers are - * matched to the memory tile. 
- * - * The extension is inserted before the core computation in case of a read - * and after the core computation in case of a write. - * In the case of a read, we first need to make sure there is some - * synchronization before the core computation such that we can put the read - * from global memory to shared memory before that synchronization. - * This ensures that all threads have finished copying into shared memory - * before the shared memory is used. - * We also need to make sure that there is a synchronization node after - * the core computation to ensure that the next load into shared memory - * only happens after all data has been used. There is no need for - * this synchronization if we are at the outer level since then there - * won't be a next load. - * In the case of a write, we need to make sure there is some synchronization - * after the core computation such taht we can put the write from shared - * memory to global memory after that synchronization. - * Unless we are at the outer level, we also need a synchronization node - * after the write to ensure the data is saved to global memory - * before the next iteration write to the same shared memory. - * It also makes sure the data has arrived in global memory before - * it is read in a subsequent iteration. 
- */ -static __isl_give isl_schedule_node *add_copies_group_shared( - struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_schedule_node *node, int read) -{ - struct gpu_array_tile *tile; - isl_union_map *access; - isl_union_set *domain; - isl_multi_aff *ma; - isl_multi_aff *from_access; - isl_multi_pw_aff *mpa; - isl_multi_union_pw_aff *mupa; - isl_schedule_node *graft; - isl_union_set *filter; - int skip; - int kernel_depth; - int empty; - - tile = gpu_array_ref_group_tile(group); - kernel_depth = isl_schedule_node_get_schedule_depth(node); - node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core); - - access = anchored_non_local_accesses(kernel, group, node, read); - empty = isl_union_map_is_empty(access); - if (empty < 0 || empty) { - isl_union_map_free(access); - if (empty < 0) - return isl_schedule_node_free(node); - return gpu_tree_move_up_to_kernel(node); - } - - group->array->global = 1; - group->local_array->global = 1; - - from_access = create_from_access(kernel->ctx, group, read); - - ma = isl_multi_aff_copy(tile->tiling); - ma = isl_multi_aff_pullback_multi_aff(ma, - isl_multi_aff_copy(from_access)); - mpa = isl_multi_pw_aff_from_multi_aff(ma); - mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa); - - domain = isl_union_map_range(access); - - if (read && !gpu_array_is_scalar(group->array)) { - isl_map *map; - isl_union_set_free(domain); - map = group_tile(group); - domain = isl_union_set_from_set(isl_map_wrap(map)); - } - - domain = isl_union_set_preimage_multi_aff(domain, from_access); - access = isl_union_set_wrapped_domain_map(domain); - access = isl_union_map_reverse(access); - access = isl_union_map_coalesce(access); - graft = isl_schedule_node_from_extension(access); - - graft = isl_schedule_node_child(graft, 0); - - graft = isl_schedule_node_insert_partial_schedule(graft, mupa); - if (kernel->options->unroll_copy_shared) - graft = ppcg_set_schedule_node_type(graft, isl_ast_loop_unroll); - - if (tile->n > 
kernel->n_block && kernel->n_block > 0) { - graft = isl_schedule_node_band_split(graft, - tile->n - kernel->n_block); - graft = isl_schedule_node_child(graft, 0); - } - if (tile->n < kernel->n_block) - skip = kernel->n_block - tile->n; - else - skip = 0; - filter = set_schedule_modulo(graft, kernel->thread_ids, - kernel->block_dim); - if (!kernel->options->wrap) - graft = snap_band_to_sizes(graft, kernel->block_dim + skip, - kernel->options); - if (tile->n > kernel->n_block && kernel->n_block > 0) - graft = isl_schedule_node_parent(graft); - graft = isl_schedule_node_insert_filter(graft, filter); - - while (graft && isl_schedule_node_has_parent(graft)) - graft = isl_schedule_node_parent(graft); - - if (read) { - if (kernel_depth < tile->depth) - node = gpu_tree_ensure_sync_after_core(node, kernel); - node = gpu_tree_move_left_to_sync(node, kernel); - node = isl_schedule_node_graft_before(node, graft); - } else { - node = gpu_tree_move_right_to_sync(node, kernel); - node = isl_schedule_node_graft_after(node, graft); - if (kernel_depth < tile->depth) - node = add_group_write_sync(node, kernel, group, 1); - } - - node = gpu_tree_move_up_to_kernel(node); - - return node; -} - -/* Check whether the array reference group "group" is mapped to - * private or shared memory and, if so, - * add copy statements to the schedule tree of "node" - * for reading from global memory to private or shared memory - * (if "read" is set) or for writing back from private or shared memory - * to global memory (if "read" is not set) for this group. - * On input, "node" points to the kernel node, and it is moved - * back there on output. 
- */ -static __isl_give isl_schedule_node *add_copies_group( - struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_schedule_node *node, int read) -{ - enum ppcg_group_access_type type; - - type = gpu_array_ref_group_type(group); - if (type == ppcg_access_private) - return add_copies_group_private(kernel, group, node, read); - if (type == ppcg_access_shared) - return add_copies_group_shared(kernel, group, node, read); - return node; -} - -/* For each array reference group that is mapped to private or shared memory, - * add copy statements to the schedule tree of "node" - * for reading from global memory to private or shared memory - * and for writing back. - * On input, "node" points to the kernel node, and it is moved - * back there on output. - */ -static __isl_give isl_schedule_node *add_copies(struct ppcg_kernel *kernel, - __isl_take isl_schedule_node *node) -{ - int i, j; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - - node = add_copies_group(kernel, group, node, 1); - if (!node) - return NULL; - node = add_copies_group(kernel, group, node, 0); - if (!node) - return NULL; - } - } - - return node; -} - -/* Mark all dimensions in the current band node atomic. - */ -static __isl_give isl_schedule_node *atomic(__isl_take isl_schedule_node *node) -{ - return ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); -} - -/* Mark "node" atomic, if it is a band node. - * Do the same for all ancestors. - * Return a pointer to "node" (in the updated schedule tree). 
- */ -static __isl_give isl_schedule_node *atomic_ancestors( - __isl_take isl_schedule_node *node) -{ - int pos; - - if (!node) - return NULL; - if (!isl_schedule_node_has_parent(node)) - return node; - - pos = isl_schedule_node_get_child_position(node); - node = isl_schedule_node_parent(node); - if (isl_schedule_node_get_type(node) == isl_schedule_node_band) - node = atomic(node); - node = atomic_ancestors(node); - node = isl_schedule_node_child(node, pos); - - return node; -} - -/* Collect all write references that require synchronization. - * "node" is assumed to point to the kernel node. - * Each reference is represented by a universe set in a space - * - * [S[i,j] -> R[]] - * - * with S[i,j] the statement instance space and R[] the array reference. - * - * This function should be called before block and thread filters are added. - * - * Synchronization is needed after a write if there is a subsequent read - * within the same block that may not be performed by the same thread. - * There should not be any dependences between different blocks, - * so we start with the flow dependences within the same kernel invocation - * and we subtract from these those dependences that are mapped - * to the same iteration of the bands where synchronization is inserted. - * We do not remove pairs of instances that are known to map to - * the same thread across different iterations of the intermediate - * bands because the read may be performed by a different thread - * than the one that needs the value if shared memory is involved. - * - * We also consider all pairs of possible writes that access the same - * memory location and that may be mapped to the same block but not - * to the same iteration of the intermediate bands. - * In theory, it would be possible for one thread to still be in - * a previous iteration of a loop in these bands. 
- * A write to global memory in this delayed thread could then overwrite - * a write from another thread that has already moved on to - * the next iteration. - * - * After computing the above writes paired off with reads or writes - * that depend on them, we project onto the domain writes. - * Sychronization is needed after writes to global memory - * through these references. - */ -static __isl_give isl_union_set *compute_sync_writes( - struct ppcg_kernel *kernel, __isl_keep isl_schedule_node *node) -{ - isl_union_map *local; - isl_union_map *may_writes, *shared_access; - isl_union_map *kernel_prefix, *thread_prefix; - isl_union_map *equal; - isl_union_set *wrap; - isl_union_set *domain; - isl_union_pw_multi_aff *contraction; - - kernel_prefix = isl_schedule_node_get_prefix_schedule_union_map(node); - node = isl_schedule_node_copy(node); - node = gpu_tree_move_down_to_thread(node, kernel->core); - thread_prefix = isl_schedule_node_get_prefix_schedule_union_map(node); - isl_schedule_node_free(node); - - contraction = kernel->contraction; - kernel_prefix = isl_union_map_preimage_domain_union_pw_multi_aff( - kernel_prefix, isl_union_pw_multi_aff_copy(contraction)); - thread_prefix = isl_union_map_preimage_domain_union_pw_multi_aff( - thread_prefix, isl_union_pw_multi_aff_copy(contraction)); - domain = isl_union_set_copy(kernel->expanded_domain); - domain = isl_union_set_universe(domain); - - may_writes = isl_union_map_copy(kernel->prog->scop->tagged_may_writes); - may_writes = isl_union_map_curry(may_writes); - may_writes = isl_union_map_intersect_domain(may_writes, domain); - may_writes = isl_union_map_uncurry(may_writes); - shared_access = isl_union_map_copy(may_writes); - shared_access = isl_union_map_apply_range(shared_access, - isl_union_map_reverse(may_writes)); - - local = isl_union_map_copy(kernel->prog->scop->tagged_dep_flow); - local = isl_union_map_union(local, shared_access); - local = isl_union_map_zip(local); - - equal = 
isl_union_map_apply_range(kernel_prefix, - isl_union_map_reverse(isl_union_map_copy(kernel_prefix))); - wrap = isl_union_map_wrap(equal); - local = isl_union_map_intersect_domain(local, wrap); - equal = isl_union_map_apply_range(thread_prefix, - isl_union_map_reverse(isl_union_map_copy(thread_prefix))); - wrap = isl_union_map_wrap(equal); - local = isl_union_map_subtract_domain(local, wrap); - - local = isl_union_map_zip(local); - local = isl_union_map_universe(local); - - return isl_union_map_domain(local); -} - -/* Group the domain elements into a single space, named kernelX, - * with X the kernel sequence number "kernel_id". - */ -static __isl_give isl_schedule_node *group_statements( - __isl_take isl_schedule_node *node, int kernel_id) -{ - char buffer[20]; - isl_id *id; - - if (!node) - return NULL; - - snprintf(buffer, sizeof(buffer), "kernel%d", kernel_id); - id = isl_id_alloc(isl_schedule_node_get_ctx(node), buffer, NULL); - return isl_schedule_node_group(node, id); -} - -/* Create a ppcg_kernel representing the domain instances that reach "node" - * and insert a mark node pointing to the ppcg_kernel before "node". - * The band that "node" points to is the band that needs to be mapped - * to block identifiers. The band that needs to be mapped to thread - * identifiers should be marked by a "thread" mark by the caller. - * The linear branch between the current node and the "thread" mark - * may also have a "shared" mark. If present, the mapping to shared - * memory is computed at that point. - * Both marks are removed by this function. - * If "scale" is set, then the band that "node" points to is scaled - * by "sizes". - * - * Mark all outer band nodes as atomic to ensure each kernel is only - * scheduled once. - * If the domain elements that reach "node" live in more than one space, - * then group the domain elements into a single space, named kernelX, - * with X the kernel sequence number. 
- * - * Insert a guard node governing the kernel node to ensure that - * no kernels with zero blocks are launched. - * - * Insert a context node describing the block and thread - * identifiers inside the kernel mark. - * The context node needs to be inserted after the effective block size - * has been determined such that the bounds on the thread identifiers - * would reflect the effective block size. - * Insert a filter node inside the context node mapping the statement - * instances to block identifiers. In particular, the block identifiers - * are equated to the partial schedule of band that was marked for mapping - * to blocks modulo the grid size. - * Insert a filter node inside the "thread" mark mapping the statement - * instances to thread identifiers. In particular, the thread identifiers - * are equated to the partial schedule of band that was marked for mapping - * to threads modulo the block size. - * - * Compute array reference groups for all arrays, set the local - * array bounds based on the set of domain instances that reach - * the kernel node, check the total amount of shared memory used - * and compute all group tilings. - * The array reference groups are computed after the block filter - * has been inserted because it affects the mapping to shared or - * private memory. This computation also requires the thread filter - * (in the ppcg_kernel object), but this thread filter should not - * have been added to the schedule tree yet since the computation - * requires the schedule of the band that needs to be mapped to - * threads before the privatization is applied. - * - * If any array reference group requires the band mapped to threads - * to be unrolled, then we perform the required unrolling. - * - * We save a copy of the schedule that may influence the mappings - * to shared or private memory in kernel->copy_schedule. 
- * - * Finally, we add synchronization and copy statements to the schedule tree, - * remove the "thread" mark and create representations for the local - * variables in the kernel. - * - * We keep a copy of the isl_id that points to the kernel to ensure - * that the kernel does not get destroyed if the schedule node - * is freed due to some error condition. - */ -__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen, - __isl_take isl_schedule_node *node, int scale, - __isl_keep isl_multi_val *sizes) -{ - struct ppcg_kernel *kernel; - isl_id *id; - isl_schedule_node *node_thread; - isl_union_map *host_schedule; - isl_union_pw_multi_aff *contraction; - isl_set *host_domain; - isl_union_set *domain, *expanded; - int single_statement; - - node = gpu_tree_insert_shared_before_thread(node); - if (!node) - return NULL; - - kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel); - kernel = ppcg_kernel_create_local_arrays(kernel, gen->prog); - if (!kernel) - return isl_schedule_node_free(node); - - domain = isl_schedule_node_get_domain(node); - single_statement = isl_union_set_n_set(domain) == 1; - - kernel->ctx = gen->ctx; - kernel->prog = gen->prog; - kernel->options = gen->options; - kernel->context = extract_context(node, gen->prog); - kernel->core = isl_union_set_universe(isl_union_set_copy(domain)); - contraction = isl_schedule_node_get_subtree_contraction(node); - kernel->contraction = isl_union_pw_multi_aff_copy(contraction); - expanded = isl_union_set_copy(domain); - expanded = isl_union_set_preimage_union_pw_multi_aff(expanded, - contraction); - kernel->expanded_domain = isl_union_set_copy(expanded); - kernel->arrays = accessed_by_domain(expanded, gen->prog); - kernel->n_grid = n_outer_coincidence(node); - node_thread = isl_schedule_node_copy(node); - node_thread = gpu_tree_move_down_to_thread(node_thread, kernel->core); - node_thread = isl_schedule_node_child(node_thread, 0); - kernel->n_block = n_outer_coincidence(node_thread); - 
isl_schedule_node_free(node_thread); - kernel->id = gen->kernel_id++; - read_grid_and_block_sizes(kernel, gen); - - kernel->sync_writes = compute_sync_writes(kernel, node); - - host_schedule = isl_schedule_node_get_prefix_schedule_union_map(node); - host_domain = isl_set_from_union_set(isl_union_map_range( - host_schedule)); - - node = atomic_ancestors(node); - - id = isl_id_alloc(gen->ctx, "kernel", kernel); - id = isl_id_set_free_user(id, &ppcg_kernel_free_wrap); - node = isl_schedule_node_insert_mark(node, isl_id_copy(id)); - - if (!single_statement) - node = group_statements(node, kernel->id); - - node = isl_schedule_node_child(node, 0); - node = split_band(node, kernel->n_grid); - kernel->block_ids = ppcg_scop_generate_names(gen->prog->scop, - kernel->n_grid, "b"); - kernel->block_filter = set_schedule_modulo(node, kernel->block_ids, - kernel->grid_dim); - kernel->grid_size = extract_grid_size(kernel, - isl_union_set_copy(domain)); - if (!kernel->options->wrap) - node = snap_band_to_sizes(node, kernel->grid_dim, - kernel->options); - if (scale) - node = scale_band(node, isl_multi_val_copy(sizes)); - node = isl_schedule_node_parent(node); - if (!single_statement) - node = isl_schedule_node_parent(node); - node = insert_guard(node, kernel->context, kernel->grid_size, - gen->prog->scop); - node = gpu_tree_move_down_to_thread(node, kernel->core); - node = isl_schedule_node_child(node, 0); - node = split_band(node, kernel->n_block); - kernel->thread_ids = ppcg_scop_generate_names(gen->prog->scop, - kernel->n_block, "t"); - kernel->thread_filter = set_schedule_modulo(node, kernel->thread_ids, - kernel->block_dim); - if (extract_block_size(kernel, domain) < 0) - node = isl_schedule_node_free(node); - - node = gpu_tree_move_up_to_kernel(node); - node = isl_schedule_node_child(node, 0); - node = insert_context(kernel, node); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_insert_filter(node, - isl_union_set_copy(kernel->block_filter)); - - node = 
gpu_tree_move_up_to_kernel(node); - - if (gpu_group_references(kernel, node) < 0) - node = isl_schedule_node_free(node); - localize_bounds(kernel, host_domain); - isl_set_free(host_domain); - - check_shared_memory_bound(kernel); - mark_global_arrays(kernel); - compute_group_tilings(kernel); - - node = gpu_tree_move_down_to_thread(node, kernel->core); - node = isl_schedule_node_child(node, 0); - if (!kernel->options->wrap) - node = snap_band_to_sizes(node, kernel->block_dim, - kernel->options); - node = isl_schedule_node_insert_filter(node, - isl_union_set_copy(kernel->thread_filter)); - if (kernel_requires_unroll(kernel)) { - node = isl_schedule_node_child(node, 0); - node = unroll(node); - } - - node = gpu_tree_move_up_to_thread(node); - kernel->copy_schedule_dim = isl_schedule_node_get_schedule_depth(node); - kernel->copy_schedule = - isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node); - contraction = isl_union_pw_multi_aff_copy(kernel->contraction); - kernel->copy_schedule = - isl_union_pw_multi_aff_pullback_union_pw_multi_aff( - kernel->copy_schedule, contraction); - - node = gpu_tree_move_up_to_kernel(node); - - node = add_sync(kernel, node); - node = add_copies(kernel, node); - - node = gpu_tree_move_down_to_shared(node, kernel->core); - node = isl_schedule_node_delete(node); - - node = gpu_tree_move_down_to_thread(node, kernel->core); - node = isl_schedule_node_delete(node); - - node = gpu_tree_move_up_to_kernel(node); - - if (create_kernel_vars(kernel) < 0) - node = isl_schedule_node_free(node); - - if (!single_statement) - node = isl_schedule_node_parent(node); - node = isl_schedule_node_parent(node); - - isl_id_free(id); - return node; -} - -/* Insert a zero-dimensional permutable band at "node". 
- */ -static __isl_give isl_schedule_node *insert_empty_permutable_band( - __isl_take isl_schedule_node *node) -{ - isl_space *space; - isl_schedule *schedule; - isl_union_set *domain; - isl_multi_union_pw_aff *mupa; - - schedule = isl_schedule_node_get_schedule(node); - domain = isl_schedule_get_domain(schedule); - space = isl_union_set_get_space(domain); - isl_union_set_free(domain); - isl_schedule_free(schedule); - - space = isl_space_set_from_params(space); - mupa = isl_multi_union_pw_aff_zero(space); - node = isl_schedule_node_insert_partial_schedule(node, mupa); - node = isl_schedule_node_band_set_permutable(node, 1); - - return node; -} - -/* See if hybrid tiling can be performed on "node" and its parent. - * If so, apply hybrid tiling and return the updated schedule tree. - * If not, return the original schedule tree. - * Return NULL on error. - * - * First check if "node", together with its parent, meets - * the basic requirements for hybrid tiling. - * If so, compute the relative dependence distances of "node" - * with respect to its parent and check if they are sufficiently bounded. - * If so, apply hybrid tiling using user specified tile sizes. - * - * The tile sizes are read before the dependence distance bounds are - * computed, because the user may have specified fewer dimensions - * than are available. In this case, the remaining schedule dimensions - * are split off and the dependence distances should be computed - * after these dimensions have been split off. 
- */ -static __isl_give isl_schedule_node *try_hybrid_tile(struct gpu_gen *gen, - __isl_take isl_schedule_node *node) -{ - int tile_len; - int *tile_size; - isl_bool ok; - isl_schedule_node *orig = node; - ppcg_ht_bounds *bounds; - - ok = ppcg_ht_parent_has_input_pattern(node); - if (ok < 0) - return isl_schedule_node_free(node); - if (!ok) - return orig; - - tile_len = 1 + isl_schedule_node_band_n_member(node); - tile_size = read_tile_sizes(gen, &tile_len); - if (!tile_size) - return isl_schedule_node_free(node); - - node = isl_schedule_node_copy(node); - node = split_band(node, tile_len - 1); - node = isl_schedule_node_parent(node); - bounds = ppcg_ht_compute_bounds(gen->prog->scop, node); - node = isl_schedule_node_child(node, 0); - - ok = ppcg_ht_bounds_is_valid(bounds); - if (ok >= 0 && ok) - node = gpu_hybrid_tile(gen, node, bounds, tile_size); - else - ppcg_ht_bounds_free(bounds); - free(tile_size); - - if (ok >= 0 && !ok) { - isl_schedule_node_free(node); - return orig; - } - isl_schedule_node_free(orig); - if (ok < 0) - return isl_schedule_node_free(node); - return node; -} - -/* If "node" is the outermost permutable band that can be mapped to block and - * thread identifiers in its branch (or the root of a subtree with - * no such outer bands), - * then mark the band as such, attaching a ppcg_kernel to the mark. - * - * If hybrid tiling is allowed, then first try and apply it - * to "node" and its parent. - * - * If "node" is the root of a subtree without permutable bands, - * then insert a zero-dimensional permutable band such that - * we can assume that "node" always points to a band node. - * This includes the case where "node" already points to a band node, - * but one without any coincident dimension. In this case, - * the extra node ensures that this original node does not get tiled. - * - * Tile "node" using user specified tile sizes, after splitting the band - * if the number of specified tile sizes is smaller than the dimension - * of the band. 
Mark the point band of this tiling as the band that - * needs to be mapped to threads and instruct the AST generator to unroll - * the band if the "unroll_gpu_tile" option is set. - * Create a kernel representing the domain instances that reach "node" and - * insert a mark node pointing to the ppcg_kernel before the band node. - */ -static __isl_give isl_schedule_node *mark_outer_permutable( - __isl_take isl_schedule_node *node, void *user) -{ - struct gpu_gen *gen = user; - int outer; - int scale; - int tile_len; - int *tile_size; - isl_id *id; - isl_multi_val *sizes; - - outer = is_outer_tilable(node); - if (outer < 0) - return isl_schedule_node_free(node); - if (!outer) - return node; - - if (gen->options->hybrid) { - isl_schedule_node *saved = isl_schedule_node_copy(node); - node = try_hybrid_tile(gen, node); - isl_schedule_node_free(saved); - if (node != saved) - return node; - } - - if (isl_schedule_node_get_type(node) != isl_schedule_node_band || - !isl_schedule_node_band_member_get_coincident(node, 0)) - node = insert_empty_permutable_band(node); - - tile_len = isl_schedule_node_band_n_member(node); - tile_size = read_tile_sizes(gen, &tile_len); - if (!tile_size) - return isl_schedule_node_free(node); - if (tile_len < isl_schedule_node_band_n_member(node)) - node = isl_schedule_node_band_split(node, tile_len); - sizes = construct_band_tiles_sizes(node, tile_size); - node = tile_band(node, isl_multi_val_copy(sizes)); - node = isl_schedule_node_child(node, 0); - if (gen->options->unroll_gpu_tile) - node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); - id = isl_id_alloc(gen->ctx, "thread", NULL); - node = isl_schedule_node_insert_mark(node, id); - node = isl_schedule_node_parent(node); - - scale = gen->options->scale_tile_loops; - node = gpu_create_kernel(gen, node, scale, sizes); - isl_multi_val_free(sizes); - free(tile_size); - - return node; -} - -/* Given a set or sequence node, return the union the filters of either all - * (if "only_initial" 
is not set) or the initial (if "only_initial" is set) - * direct subtrees that do not contain any suitably permutable bands - * (according to subtree_has_permutable_bands). - */ -static __isl_give isl_union_set *get_non_parallel_subtree_filters( - __isl_keep isl_schedule_node *node, int only_initial) -{ - isl_space *space; - isl_union_set *filter; - int i, n; - - n = isl_schedule_node_n_children(node); - if (n < 0) - return NULL; - - node = isl_schedule_node_copy(node); - node = isl_schedule_node_child(node, 0); - filter = isl_schedule_node_filter_get_filter(node); - node = isl_schedule_node_parent(node); - space = isl_union_set_get_space(filter); - isl_union_set_free(filter); - filter = isl_union_set_empty(space); - - for (i = 0; i < n; ++i) { - int parallelism; - - node = isl_schedule_node_child(node, i); - parallelism = subtree_has_permutable_bands(node); - if (parallelism < 0) { - filter = isl_union_set_free(filter); - } else if (!parallelism) { - isl_union_set *filter_i; - filter_i = isl_schedule_node_filter_get_filter(node); - filter = isl_union_set_union(filter, filter_i); - } else if (only_initial) - break; - node = isl_schedule_node_parent(node); - } - - isl_schedule_node_free(node); - - return filter; -} - -/* Given a set or sequence node, return the union of the filters of - * the direct subtrees that do not contain any suitably permutable bands - * (according to subtree_has_permutable_bands). - */ -static __isl_give isl_union_set *get_all_non_parallel_subtree_filters( - __isl_keep isl_schedule_node *node) -{ - return get_non_parallel_subtree_filters(node, 0); -} - -/* Given a set or sequence node, return the union of the filters of - * the initial direct subtrees that do not contain any suitably permutable - * bands (according to subtree_has_permutable_bands). 
- */ -static __isl_give isl_union_set *get_initial_non_parallel_subtree_filters( - __isl_keep isl_schedule_node *node) -{ - return get_non_parallel_subtree_filters(node, 1); -} - -/* Mark all variables that are accessed by the statement instances in "domain" - * and that are local to "prog" as requiring a declaration in the host code. - * The statement instances in "domain" correspond to (a subset of) - * the active instances at "node". - * "node" is not modified by this function, except that NULL is returned - * in case of error. - */ -static __isl_give isl_schedule_node *declare_accessed_local_variables( - __isl_take isl_schedule_node *node, struct gpu_prog *prog, - __isl_keep isl_union_set *domain) -{ - isl_union_pw_multi_aff *contraction; - isl_union_set *arrays; - int i; - - if (!ppcg_scop_any_hidden_declarations(prog->scop)) - return node; - contraction = isl_schedule_node_get_subtree_contraction(node); - domain = isl_union_set_copy(domain); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, contraction); - arrays = accessed_by_domain(domain, prog); - - for (i = 0; i < prog->n_array; ++i) { - isl_space *space; - isl_set *set; - int empty; - - if (!prog->array[i].local) - continue; - space = isl_set_get_space(prog->array[i].extent); - set = isl_union_set_extract_set(arrays, space); - empty = isl_set_plain_is_empty(set); - isl_set_free(set); - if (empty < 0) - goto error; - if (!empty) - prog->array[i].declare_local = 1; - } - - isl_union_set_free(arrays); - return node; -error: - isl_union_set_free(arrays); - return isl_schedule_node_free(node); -} - -/* If "node" points to a set node, then separate its children - * into subtrees that have suitably permutable bands and - * those that do not. - * Adjust the schedule tree in order to execute the second group - * after the first group and return a pointer to the first group, - * assuming there are any such subtrees. 
- * If "node" points to a sequence node, then separate the initial - * children that do not have suitably permutable bands and - * return a pointer to the subsequence of children that do have such bands, - * assuming there are any such subtrees. - * - * In both cases, mark all local variables in "prog" that are accessed by - * the group without permutable bands as requiring a declaration on the host. - */ -static __isl_give isl_schedule_node *isolate_permutable_subtrees( - __isl_take isl_schedule_node *node, struct gpu_prog *prog) -{ - isl_union_set *filter; - enum isl_schedule_node_type type; - - if (!node) - return NULL; - type = isl_schedule_node_get_type(node); - if (type == isl_schedule_node_set) { - filter = get_all_non_parallel_subtree_filters(node); - node = declare_accessed_local_variables(node, prog, filter); - node = isl_schedule_node_order_after(node, filter); - } else if (type == isl_schedule_node_sequence) { - filter = get_initial_non_parallel_subtree_filters(node); - node = declare_accessed_local_variables(node, prog, filter); - node = isl_schedule_node_order_before(node, filter); - } - - return node; -} - -/* Replace any reference to an array element in the range of "copy" - * by a reference to all array elements (defined by the extent of the array). 
- */ -static __isl_give isl_union_map *approximate_copy_out( - __isl_take isl_union_map *copy, struct gpu_prog *prog) -{ - int i; - isl_union_map *res; - - res = isl_union_map_empty(isl_union_map_get_space(copy)); - - for (i = 0; i < prog->n_array; ++i) { - isl_space *space; - isl_set *set; - isl_union_map *copy_i; - isl_union_set *extent, *domain; - - space = isl_space_copy(prog->array[i].space); - extent = isl_union_set_from_set(isl_set_universe(space)); - copy_i = isl_union_map_copy(copy); - copy_i = isl_union_map_intersect_range(copy_i, extent); - set = isl_set_copy(prog->array[i].extent); - extent = isl_union_set_from_set(set); - domain = isl_union_map_domain(copy_i); - copy_i = isl_union_map_from_domain_and_range(domain, extent); - res = isl_union_map_union(res, copy_i); - } - - isl_union_map_free(copy); - - return res; -} - -/* Insert "kernel" marks that point to a ppcg_kernel structure - * in front of all outermost tilable band that (by construction) - * have at least one parallel loop. - */ -static __isl_give isl_schedule_node *mark_kernels(struct gpu_gen *gen, - __isl_take isl_schedule_node *node) -{ - return isl_schedule_node_map_descendant_bottom_up(node, - &mark_outer_permutable, gen); -} - -/* Construct schedule constraints from the dependences in prog->scop and - * the array order dependences in prog->array_order. - * - * If live range reordering is allowed, then we need to make sure - * that live ranges on arrays are not run in parallel since doing - * so would require array expansion. We therefore add the array - * order dependences to the coincidence dependences. Non-zero array - * order dependences will then prevent a schedule dimension from being - * considered parallel. - * Live ranges derived from scalars are allowed to be run in parallel - * since we force the scalars to be mapped to private memory in - * check_scalar_live_ranges. 
- * If live range reordering is allowed, then the false dependences - * are not added to the validity constraints as that would prevent - * reordering. Instead, the external false dependences that enforce that reads - * from potentially live-in data precede any later write and - * that writes of potentially live-out data follow any other earlier write - * are added to the validity and the coincidence constraints. - * The false dependences are still added to the proximity constraints - * for consistency with the case where live range reordering is not allowed. - * The coincidence constraints then consist of flow dependences, - * external false dependences and array order dependences. - * The independences can be filtered out from the first two sets. - * They have already been filtered out from the array order dependences - * on a per array basis in collect_order_dependences. - * There is no need for a per array handling of the other two sets - * as there should be no flow or external false dependence on local - * variables that can be filtered out. 
- */ -static __isl_give isl_schedule_constraints *construct_schedule_constraints( - struct gpu_prog *prog) -{ - isl_union_set *domain; - isl_union_map *dep_raw, *dep; - isl_union_map *validity, *proximity, *coincidence; - isl_schedule_constraints *sc; - - domain = isl_union_set_copy(prog->scop->domain); - sc = isl_schedule_constraints_on_domain(domain); - sc = isl_schedule_constraints_set_context(sc, - isl_set_copy(prog->scop->context)); - if (prog->scop->options->live_range_reordering) { - sc = isl_schedule_constraints_set_conditional_validity(sc, - isl_union_map_copy(prog->scop->tagged_dep_flow), - isl_union_map_copy(prog->scop->tagged_dep_order)); - proximity = isl_union_map_copy(prog->scop->dep_flow); - validity = isl_union_map_copy(proximity); - validity = isl_union_map_union(validity, - isl_union_map_copy(prog->scop->dep_forced)); - proximity = isl_union_map_union(proximity, - isl_union_map_copy(prog->scop->dep_false)); - coincidence = isl_union_map_copy(validity); - coincidence = isl_union_map_subtract(coincidence, - isl_union_map_copy(prog->scop->independence)); - coincidence = isl_union_map_union(coincidence, - isl_union_map_copy(prog->array_order)); - } else { - dep_raw = isl_union_map_copy(prog->scop->dep_flow); - dep = isl_union_map_copy(prog->scop->dep_false); - dep = isl_union_map_union(dep, dep_raw); - dep = isl_union_map_coalesce(dep); - proximity = isl_union_map_copy(dep); - coincidence = isl_union_map_copy(dep); - validity = dep; - } - sc = isl_schedule_constraints_set_validity(sc, validity); - sc = isl_schedule_constraints_set_coincidence(sc, coincidence); - sc = isl_schedule_constraints_set_proximity(sc, proximity); - - if (prog->scop->options->debug->dump_schedule_constraints) - isl_schedule_constraints_dump(sc); - return sc; -} - -/* Compute an appropriate schedule based on the accesses in - * gen->read and gen->write. 
- * - * We derive schedule constraints from the dependences in gen->prog->scop - * and then use isl to compute a schedule that has a parallel loop - * in each tilable band. - * During the schedule construction, some statement instances - * may be grouped first based on the input schedule. - */ -static __isl_give isl_schedule *compute_schedule(struct gpu_gen *gen) -{ - isl_schedule_constraints *sc; - isl_schedule *schedule; - - sc = construct_schedule_constraints(gen->prog); - schedule = gen->prog->scop->schedule; - schedule = ppcg_compute_schedule(sc, schedule, gen->options); - - return schedule; -} - -/* If the band node "node" has exactly one member then mark it permutable. - */ -static __isl_give isl_schedule_node *band_set_permutable( - __isl_take isl_schedule_node *node, - __isl_keep isl_schedule_constraints *sc) -{ - if (isl_schedule_node_band_n_member(node) == 1) - node = isl_schedule_node_band_set_permutable(node, 1); - - return node; -} - -/* Return the coincidence constraints between pairs of instances - * that are scheduled together by the ancestors of "node". - * That is, select those coincidence constraints that relate - * pairs of instances that have the same value for the prefix schedule. - * If the schedule depth is zero, then the prefix schedule does not - * contain any information, so we intersect domain and range - * of the schedule constraints with the reaching domain elements instead. 
- */ -static __isl_give isl_union_map *get_local_coincidence( - __isl_keep isl_schedule_node *node, - __isl_keep isl_schedule_constraints *sc) -{ - isl_union_map *coincidence; - isl_multi_union_pw_aff *prefix; - isl_union_pw_multi_aff *contraction; - - coincidence = isl_schedule_constraints_get_coincidence(sc); - contraction = isl_schedule_node_get_subtree_contraction(node); - if (isl_schedule_node_get_schedule_depth(node) == 0) { - isl_union_set *domain; - - domain = isl_schedule_node_get_domain(node); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - contraction); - coincidence = isl_union_map_intersect_domain(coincidence, - isl_union_set_copy(domain)); - coincidence = isl_union_map_intersect_range(coincidence, - domain); - return coincidence; - } - - prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); - prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix, - contraction); - return isl_union_map_eq_at_multi_union_pw_aff(coincidence, prefix); -} - -/* For each member in the band node "node", determine whether - * it is coincident with respect to the outer nodes and mark - * it accordingly. - * - * That is, for each coincidence constraint between pairs - * of instances that are scheduled together by the outer nodes, - * check that domain and range are assigned the same value - * by the band member. This test is performed by checking - * that imposing the same value for the band member does not - * remove any elements from the set of coincidence constraints. 
- */ -static __isl_give isl_schedule_node *band_set_coincident( - __isl_take isl_schedule_node *node, - __isl_keep isl_schedule_constraints *sc) -{ - isl_union_map *coincidence; - isl_union_pw_multi_aff *contraction; - isl_multi_union_pw_aff *partial; - int i, n; - - coincidence = get_local_coincidence(node, sc); - - partial = isl_schedule_node_band_get_partial_schedule(node); - contraction = isl_schedule_node_get_subtree_contraction(node); - partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial, - contraction); - n = isl_schedule_node_band_n_member(node); - for (i = 0; i < n; ++i) { - isl_union_map *coincidence_i; - isl_union_pw_aff *upa; - isl_multi_union_pw_aff *partial_i; - int subset; - - upa = isl_multi_union_pw_aff_get_union_pw_aff(partial, i); - partial_i = isl_multi_union_pw_aff_from_union_pw_aff(upa); - coincidence_i = isl_union_map_copy(coincidence); - coincidence_i = isl_union_map_eq_at_multi_union_pw_aff( - coincidence_i, partial_i); - subset = isl_union_map_is_subset(coincidence, coincidence_i); - isl_union_map_free(coincidence_i); - - if (subset < 0) - break; - node = isl_schedule_node_band_member_set_coincident(node, i, - subset); - } - if (i < n) - node = isl_schedule_node_free(node); - isl_multi_union_pw_aff_free(partial); - isl_union_map_free(coincidence); - - return node; -} - -/* If "node" is a band, then set its properties. - * - * In particular, if the band has exactly one member, then mark it permutable. - * Mark the band member coincident based on the coincidence constraints - * of "sc". 
- */ -static __isl_give isl_schedule_node *set_band_properties( - __isl_take isl_schedule_node *node, void *user) -{ - isl_schedule_constraints *sc = user; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_band) - return node; - if (isl_schedule_node_band_n_member(node) == 0) - return node; - - node = band_set_permutable(node, sc); - node = band_set_coincident(node, sc); - - return node; -} - -/* Return the original schedule with all bands marked permutable and - * all band members marked coincident based on the coincidence constraints. - * The bands are explicitly marked permutable so that they will be considered - * by mark_outer_permutable. - */ -static __isl_give isl_schedule *determine_properties_original_schedule( - struct gpu_gen *gen) -{ - isl_schedule *schedule; - isl_schedule_constraints *sc; - - schedule = isl_schedule_copy(gen->prog->scop->schedule); - sc = construct_schedule_constraints(gen->prog); - schedule = isl_schedule_map_schedule_node_bottom_up(schedule, - &set_band_properties, sc); - isl_schedule_constraints_free(sc); - - return schedule; -} - -/* Compute a schedule or determine the properties of the original schedule - * depending on the value of the "reschedule" option. - */ -static __isl_give isl_schedule *compute_or_set_properties(void *user) -{ - struct gpu_gen *gen = user; - - if (gen->options->reschedule) - return compute_schedule(gen); - else - return determine_properties_original_schedule(gen); -} - -/* Obtain a schedule for the scop, by reading it from - * a file, by computing one or by determining the properties - * of the original schedule. - */ -__isl_give isl_schedule *get_schedule(struct gpu_gen *gen) -{ - return ppcg_get_schedule(gen->ctx, gen->options, - &compute_or_set_properties, gen); -} - -/* Construct the string "_". 
- */ -static char *concat(isl_ctx *ctx, const char *a, const char *b) -{ - isl_printer *p; - char *s; - - p = isl_printer_to_str(ctx); - p = isl_printer_print_str(p, a); - p = isl_printer_print_str(p, "_"); - p = isl_printer_print_str(p, b); - s = isl_printer_get_str(p); - isl_printer_free(p); - - return s; -} - -/* For each array in "prog" of which an element appears in "accessed" and - * that is not a read only scalar, create a zero-dimensional universe set - * of which the tuple id has name "_" and a user - * pointer pointing to the array (gpu_array_info). - * - * If the array is local to "prog", then make sure it will be declared - * in the host code. - * - * Return the list of these universe sets. - */ -static __isl_give isl_union_set_list *create_copy_filters(struct gpu_prog *prog, - const char *prefix, __isl_take isl_union_set *accessed) -{ - int i; - isl_ctx *ctx; - isl_union_set_list *filters; - - ctx = prog->ctx; - filters = isl_union_set_list_alloc(ctx, 0); - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_space *space; - isl_set *accessed_i; - int empty; - char *name; - isl_id *id; - isl_union_set *uset; - - if (gpu_array_is_read_only_scalar(array)) - continue; - - space = isl_space_copy(array->space); - accessed_i = isl_union_set_extract_set(accessed, space); - empty = isl_set_plain_is_empty(accessed_i); - isl_set_free(accessed_i); - if (empty < 0) { - filters = isl_union_set_list_free(filters); - break; - } - if (empty) - continue; - - array->global = 1; - if (array->local) - array->declare_local = 1; - - name = concat(ctx, prefix, array->name); - id = name ? 
isl_id_alloc(ctx, name, array) : NULL; - free(name); - space = isl_space_set_alloc(ctx, 0, 0); - space = isl_space_set_tuple_id(space, isl_dim_set, id); - uset = isl_union_set_from_set(isl_set_universe(space)); - - filters = isl_union_set_list_add(filters, uset); - } - isl_union_set_free(accessed); - - return filters; -} - -/* Make sure that code for the statements in "filters" that - * copy arrays to or from the device is only generated when - * the size of the corresponding array is positive. - * That is, add a set node underneath "graft" with "filters" as children - * and for each child add a guard that the selects the parameter - * values for which the corresponding array has a positive size. - * The array is available in the user pointer of the statement identifier. - * "depth" is the schedule depth of the position where "graft" - * will be added. - */ -static __isl_give isl_schedule_node *insert_positive_size_guards( - __isl_take isl_schedule_node *graft, - __isl_take isl_union_set_list *filters, int depth) -{ - int i, n; - - graft = isl_schedule_node_child(graft, 0); - graft = isl_schedule_node_insert_set(graft, filters); - n = isl_schedule_node_n_children(graft); - for (i = 0; i < n; ++i) { - isl_union_set *filter; - isl_set *domain, *guard; - isl_id *id; - struct gpu_array_info *array; - - graft = isl_schedule_node_child(graft, i); - filter = isl_schedule_node_filter_get_filter(graft); - domain = isl_set_from_union_set(filter); - id = isl_set_get_tuple_id(domain); - array = isl_id_get_user(id); - isl_id_free(id); - isl_set_free(domain); - guard = gpu_array_positive_size_guard(array); - guard = isl_set_from_params(guard); - guard = isl_set_add_dims(guard, isl_dim_set, depth); - graft = isl_schedule_node_child(graft, 0); - graft = isl_schedule_node_insert_guard(graft, guard); - graft = isl_schedule_node_parent(graft); - graft = isl_schedule_node_parent(graft); - } - graft = isl_schedule_node_parent(graft); - - return graft; -} - -/* Create a graft for 
copying arrays to or from the device, - * whenever the size of the array is strictly positive. - * Each statement is called "_" and - * the identifier has a user pointer pointing to the array. - * The graft will be added at the position specified by "node". - * "copy" contains the array elements that need to be copied. - * Only arrays of which some elements need to be copied - * will have a corresponding statement in the graph. - * Note though that each such statement will copy the entire array. - */ -static __isl_give isl_schedule_node *create_copy_device(struct gpu_prog *prog, - __isl_keep isl_schedule_node *node, const char *prefix, - __isl_take isl_union_set *copy) -{ - int depth; - isl_ctx *ctx; - isl_space *space; - isl_union_set *all, *domain; - isl_union_set_list *filters; - isl_union_map *extension; - isl_schedule_node *graft; - - ctx = prog->ctx; - depth = isl_schedule_node_get_schedule_depth(node); - filters = create_copy_filters(prog, prefix, copy); - all = isl_union_set_list_union(isl_union_set_list_copy(filters)); - - space = depth < 0 ? NULL : isl_space_set_alloc(ctx, 0, depth); - domain = isl_union_set_from_set(isl_set_universe(space)); - extension = isl_union_map_from_domain_and_range(domain, all); - graft = isl_schedule_node_from_extension(extension); - - if (!filters) - return isl_schedule_node_free(graft); - if (isl_union_set_list_n_union_set(filters) == 0) { - isl_union_set_list_free(filters); - return graft; - } - - return insert_positive_size_guards(graft, filters, depth); -} - -/* Return (the universe spaces of) the arrays that are declared - * inside the scop corresponding to "prog" and for which all - * potential writes inside the scop form a subset of "domain". 
- */ -static __isl_give isl_union_set *extract_local_accesses(struct gpu_prog *prog, - __isl_keep isl_union_set *domain) -{ - int i; - isl_union_set *local; - - local = isl_union_set_empty(isl_union_set_get_space(domain)); - - for (i = 0; i < prog->n_array; ++i) { - isl_set *set; - isl_union_map *to_outer; - isl_union_map *may_write; - isl_union_set *write_domain; - isl_union_set *fields; - int subset; - - if (!prog->array[i].local) - continue; - - set = isl_set_universe(isl_space_copy(prog->array[i].space)); - to_outer = isl_union_map_copy(prog->to_outer); - to_outer = isl_union_map_intersect_range(to_outer, - isl_union_set_from_set(isl_set_copy(set))); - fields = isl_union_map_domain(to_outer); - may_write = isl_union_map_copy(prog->may_write); - may_write = isl_union_map_intersect_range(may_write, fields); - write_domain = isl_union_map_domain(may_write); - subset = isl_union_set_is_subset(write_domain, domain); - isl_union_set_free(write_domain); - - if (subset < 0) { - isl_set_free(set); - return isl_union_set_free(local); - } else if (subset) { - local = isl_union_set_add_set(local, set); - } else { - isl_set_free(set); - } - } - - return local; -} - -/* Internal data structure for node_may_persist. - * - * "tagger" maps tagged iteration domains to the corresponding untagged - * iteration domain. - * - * "may_persist_flow" is the set of all tagged dataflow dependences - * with those dependences removed that either precede or follow - * the kernel launch in a sequence. - * "inner_band_flow" is the set of all tagged dataflow dependences - * that are local to a given iteration of the outer band nodes - * with respect to the current node. - * "local_flow" is equal to "inner_band_flow", except that the domain - * and the range have been intersected with intermediate filters - * on children of sets or sequences. 
- */ -struct ppcg_may_persist_data { - isl_union_pw_multi_aff *tagger; - - isl_union_map *local_flow; - isl_union_map *inner_band_flow; - isl_union_map *may_persist_flow; -}; - -/* Update the information in "data" based on the band ancestor "node". - * - * In particular, we restrict the dependences in data->local_flow - * to those dependence where the source and the sink occur in - * the same iteration of the given band node. - * We also update data->inner_band_flow to the new value of - * data->local_flow. - */ -static int update_may_persist_at_band(__isl_keep isl_schedule_node *node, - struct ppcg_may_persist_data *data) -{ - isl_multi_union_pw_aff *partial; - isl_union_pw_multi_aff *contraction; - isl_union_map *flow; - - if (isl_schedule_node_band_n_member(node) == 0) - return 0; - - partial = isl_schedule_node_band_get_partial_schedule(node); - contraction = isl_schedule_node_get_subtree_contraction(node); - partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial, - contraction); - partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial, - isl_union_pw_multi_aff_copy(data->tagger)); - - flow = data->local_flow; - flow = isl_union_map_eq_at_multi_union_pw_aff(flow, partial); - data->local_flow = flow; - - isl_union_map_free(data->inner_band_flow); - data->inner_band_flow = isl_union_map_copy(data->local_flow); - - return 0; -} - -/* Given a set of local reaching domain elements "domain", - * expand them to the corresponding leaf domain elements using "contraction" - * and insert the array references tags using data->tagger. 
- */ -static __isl_give isl_union_set *expand_and_tag( - __isl_take isl_union_set *domain, - __isl_take isl_union_pw_multi_aff *contraction, - struct ppcg_may_persist_data *data) -{ - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - contraction); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - isl_union_pw_multi_aff_copy(data->tagger)); - return domain; -} - -/* Given a filter node that is the child of a set or sequence node, - * restrict data->local_flow to refer only to those elements - * in the filter of the node. - * "contraction" maps the leaf domain elements of the schedule tree - * to the corresponding domain elements at (the parent of) "node". - */ -static int filter_flow(__isl_keep isl_schedule_node *node, - struct ppcg_may_persist_data *data, - __isl_take isl_union_pw_multi_aff *contraction) -{ - isl_union_set *filter; - isl_union_map *flow; - - flow = data->local_flow; - filter = isl_schedule_node_filter_get_filter(node); - filter = expand_and_tag(filter, contraction, data); - flow = isl_union_map_intersect_domain(flow, isl_union_set_copy(filter)); - flow = isl_union_map_intersect_range(flow, filter); - data->local_flow = flow; - - return 0; -} - -/* Given a filter node "node", collect the filters on all preceding siblings - * (which are also filter nodes), add them to "filters" and return the result. 
- */ -static __isl_give isl_union_set *add_previous_filters( - __isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node) -{ - isl_schedule_node *sibling; - - sibling = isl_schedule_node_copy(node); - while (sibling && isl_schedule_node_has_previous_sibling(sibling)) { - isl_union_set *filter; - - sibling = isl_schedule_node_previous_sibling(sibling); - filter = isl_schedule_node_filter_get_filter(sibling); - filters = isl_union_set_union(filters, filter); - } - isl_schedule_node_free(sibling); - if (!sibling) - return isl_union_set_free(filters); - - return filters; -} - -/* Given a filter node "node", collect the filters on all following siblings - * (which are also filter nodes), add them to "filters" and return the result. - */ -static __isl_give isl_union_set *add_next_filters( - __isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node) -{ - isl_schedule_node *sibling; - - sibling = isl_schedule_node_copy(node); - while (sibling && isl_schedule_node_has_next_sibling(sibling)) { - isl_union_set *filter; - - sibling = isl_schedule_node_next_sibling(sibling); - filter = isl_schedule_node_filter_get_filter(sibling); - filters = isl_union_set_union(filters, filter); - } - isl_schedule_node_free(sibling); - if (!sibling) - return isl_union_set_free(filters); - - return filters; -} - -/* Remove those flow dependences from data->may_persist_flow - * that flow between elements of "domain" within the same iteration - * of all outer band nodes. - * "contraction" maps the leaf domain elements of the schedule tree - * to the corresponding elements "domain". 
- */ -static void remove_external_flow(struct ppcg_may_persist_data *data, - __isl_take isl_union_set *domain, - __isl_keep isl_union_pw_multi_aff *contraction) -{ - isl_union_map *flow; - - contraction = isl_union_pw_multi_aff_copy(contraction); - domain = expand_and_tag(domain, contraction, data); - flow = isl_union_map_copy(data->local_flow); - flow = isl_union_map_intersect_domain(flow, isl_union_set_copy(domain)); - flow = isl_union_map_intersect_range(flow, domain); - - data->may_persist_flow = isl_union_map_subtract(data->may_persist_flow, - flow); -} - -/* Update the information in "data" based on the filter ancestor "node". - * We only need to modify anything if the filter is the child - * of a set or sequence node. - * - * In the case of a sequence, we remove the dependences between - * statement instances that are both executed either before or - * after the subtree that will be mapped to a kernel, within - * the same iteration of outer bands. - * - * In both cases, we restrict data->local_flow to the current child. 
- */ -static int update_may_persist_at_filter(__isl_keep isl_schedule_node *node, - struct ppcg_may_persist_data *data) -{ - enum isl_schedule_node_type type; - isl_schedule_node *parent; - isl_space *space; - isl_union_pw_multi_aff *contraction; - isl_union_set *before, *after, *filter; - - type = isl_schedule_node_get_parent_type(node); - if (type != isl_schedule_node_sequence && type != isl_schedule_node_set) - return 0; - - parent = isl_schedule_node_copy(node); - parent = isl_schedule_node_parent(parent); - contraction = isl_schedule_node_get_subtree_contraction(parent); - isl_schedule_node_free(parent); - - if (type == isl_schedule_node_set) - return filter_flow(node, data, contraction); - - filter = isl_schedule_node_filter_get_filter(node); - space = isl_union_set_get_space(filter); - isl_union_set_free(filter); - before = isl_union_set_empty(space); - after = isl_union_set_copy(before); - before = add_previous_filters(before, node); - after = add_next_filters(after, node); - - remove_external_flow(data, before, contraction); - remove_external_flow(data, after, contraction); - - return filter_flow(node, data, contraction); -} - -/* Update the information in "data" based on the ancestor "node". 
- */ -static isl_stat update_may_persist_at(__isl_keep isl_schedule_node *node, - void *user) -{ - struct ppcg_may_persist_data *data = user; - - switch (isl_schedule_node_get_type(node)) { - case isl_schedule_node_error: - return isl_stat_error; - case isl_schedule_node_context: - case isl_schedule_node_domain: - case isl_schedule_node_expansion: - case isl_schedule_node_extension: - case isl_schedule_node_guard: - case isl_schedule_node_leaf: - case isl_schedule_node_mark: - case isl_schedule_node_sequence: - case isl_schedule_node_set: - break; - case isl_schedule_node_band: - if (update_may_persist_at_band(node, data) < 0) - return isl_stat_error; - break; - case isl_schedule_node_filter: - if (update_may_persist_at_filter(node, data) < 0) - return isl_stat_error; - break; - } - - return isl_stat_ok; -} - -/* Determine the set of array elements that may need to be perserved - * by a kernel constructed from the subtree at "node". - * This includes the set of array elements that may need to be preserved - * by the entire scop (prog->may_persist) and the elements for which - * there is a potential flow dependence that may cross a kernel launch. - * - * To determine the second set, we start from all flow dependences. - * From this set of dependences, we remove those that cannot possibly - * require data to be preserved by a kernel launch. - * In particular, we consider the following sets of dependences. - * - dependences of which the write occurs inside the kernel. - * If the data is needed outside the kernel, then it will - * be copied out immediately after the kernel launch, so there - * is no need for any special care. - * - dependences of which the read occurs inside the kernel and the - * corresponding write occurs inside the same iteration of the - * outer band nodes. This means that the data is needed in - * the first kernel launch after the write, which is already - * taken care of by the standard copy-in. 
That is, the data - * do not need to be preserved by any intermediate call to - * the same kernel. - * - dependences of which the write and the read either both occur - * before the kernel launch or both occur after the kernel launch, - * within the same iteration of the outer band nodes with respect - * to the sequence that determines the ordering of the dependence - * and the kernel launch. Such flow dependences cannot cross - * any kernel launch. - * - * For the remaining (tagged) dependences, we take the domain - * (i.e., the tagged writes) and apply the tagged access relation - * to obtain the accessed data elements. - * These are then combined with the elements that may need to be - * preserved by the entire scop. - */ -static __isl_give isl_union_set *node_may_persist( - __isl_keep isl_schedule_node *node, struct gpu_prog *prog) -{ - struct ppcg_may_persist_data data; - isl_union_pw_multi_aff *contraction; - isl_union_set *domain; - isl_union_set *persist; - isl_union_map *flow, *local_flow; - - data.tagger = prog->scop->tagger; - - flow = isl_union_map_copy(prog->scop->tagged_dep_flow); - data.local_flow = isl_union_map_copy(flow); - data.inner_band_flow = isl_union_map_copy(flow); - data.may_persist_flow = flow; - if (isl_schedule_node_foreach_ancestor_top_down(node, - &update_may_persist_at, &data) < 0) - data.may_persist_flow = - isl_union_map_free(data.may_persist_flow); - flow = data.may_persist_flow; - isl_union_map_free(data.local_flow); - - domain = isl_schedule_node_get_domain(node); - contraction = isl_schedule_node_get_subtree_contraction(node); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - contraction); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - isl_union_pw_multi_aff_copy(data.tagger)); - flow = isl_union_map_subtract_domain(flow, isl_union_set_copy(domain)); - local_flow = data.inner_band_flow; - local_flow = isl_union_map_intersect_range(local_flow, domain); - flow = isl_union_map_subtract(flow, 
local_flow); - - persist = isl_union_map_domain(flow); - persist = isl_union_set_apply(persist, - isl_union_map_copy(prog->scop->tagged_may_writes)); - persist = isl_union_set_union(persist, - isl_union_set_copy(prog->may_persist)); - - return persist; -} - -/* Add nodes for copying outer arrays in and out of the device - * before and after the subtree "node", which contains one or more kernels. - * "domain" contains the original statement instances, i.e., - * those that correspond to the domains of the access relations in "prog". - * In particular, the domain has not been contracted in any way. - * "prefix" contains the prefix schedule at that point, in terms - * of the same original statement instances. - * - * We first compute the sets of outer array elements that need - * to be copied in and out and then graft in the nodes for - * performing this copying. - * - * In particular, for each array that is possibly written anywhere in - * the subtree "node" and that may be used after "node" - * or that may be visible outside the corresponding scop, - * we copy out its entire extent. - * - * Any array elements that is read without first being written inside - * the subtree "node" needs to be copied in. - * Furthermore, if there are any array elements that - * are copied out, but that may not be written inside "node, then - * they also need to be copied in to ensure that the value after execution - * is the same as the value before execution, at least for those array - * elements that may have their values preserved by the scop or that - * may be written before "node" and read after "node". - * In case the array elements are structures, we need to take into - * account that all members of the structures need to be written - * by "node" before we can avoid copying the data structure in. - * - * Note that the may_write relation is intersected with the domain, - * which has been intersected with the context. 
- * This helps in those cases where the arrays are declared with a fixed size, - * while the accesses are parametric and the context assigns a fixed value - * to the parameters. - * - * If an element from a local array is read without first being written, - * then there is no point in copying it in since it cannot have been - * written prior to the scop. Warn about the uninitialized read instead. - */ -static __isl_give isl_schedule_node *add_to_from_device( - __isl_take isl_schedule_node *node, __isl_take isl_union_set *domain, - __isl_take isl_union_map *prefix, struct gpu_prog *prog) -{ - isl_union_set *local; - isl_union_set *may_persist; - isl_union_map *may_write, *must_write, *copy_out, *not_written; - isl_union_map *read, *copy_in; - isl_union_map *tagged; - isl_union_map *local_uninitialized; - isl_schedule_node *graft; - - tagged = isl_union_map_copy(prog->scop->tagged_reads); - tagged = isl_union_map_union(tagged, - isl_union_map_copy(prog->scop->tagged_may_writes)); - - may_write = isl_union_map_copy(prog->may_write); - may_write = isl_union_map_intersect_domain(may_write, - isl_union_set_copy(domain)); - may_write = remove_local_accesses(prog, - isl_union_map_copy(tagged), may_write, - isl_union_map_copy(prefix), 0); - may_write = isl_union_map_apply_range(may_write, - isl_union_map_copy(prog->to_outer)); - may_write = isl_union_map_apply_domain(may_write, - isl_union_map_copy(prefix)); - may_write = approximate_copy_out(may_write, prog); - copy_out = isl_union_map_copy(may_write); - may_write = isl_union_map_apply_range(may_write, - isl_union_map_copy(prog->to_inner)); - must_write = isl_union_map_copy(prog->must_write); - must_write = isl_union_map_apply_domain(must_write, - isl_union_map_copy(prefix)); - may_persist = node_may_persist(node, prog); - may_write = isl_union_map_intersect_range(may_write, may_persist); - not_written = isl_union_map_subtract(may_write, must_write); - - local = extract_local_accesses(prog, domain); - read = 
isl_union_map_copy(prog->read); - read = isl_union_map_intersect_domain(read, domain); - read = remove_local_accesses(prog, tagged, read, - isl_union_map_copy(prefix), 1); - local = isl_union_set_apply(local, isl_union_map_copy(prog->to_inner)); - local_uninitialized = isl_union_map_copy(prog->scop->live_in); - local_uninitialized = isl_union_map_intersect_range(local_uninitialized, - local); - local_uninitialized = isl_union_map_intersect(local_uninitialized, - isl_union_map_copy(read)); - if (!isl_union_map_is_empty(local_uninitialized)) { - fprintf(stderr, - "possibly uninitialized reads (not copied in):\n"); - isl_union_map_dump(local_uninitialized); - } - read = isl_union_map_subtract(read, local_uninitialized); - read = isl_union_map_apply_domain(read, prefix); - copy_in = isl_union_map_union(read, not_written); - copy_in = isl_union_map_apply_range(copy_in, - isl_union_map_copy(prog->to_outer)); - - graft = create_copy_device(prog, node, "to_device", - isl_union_map_range(copy_in)); - node = isl_schedule_node_graft_before(node, graft); - graft = create_copy_device(prog, node, "from_device", - isl_union_map_range(copy_out)); - node = isl_schedule_node_graft_after(node, graft); - - return node; -} - -/* Add nodes for initializing ("init_device") and clearing ("clear_device") - * the device before and after "node". 
- */ -static __isl_give isl_schedule_node *add_init_clear_device( - __isl_take isl_schedule_node *node) -{ - isl_ctx *ctx; - isl_space *space; - isl_union_set *domain; - isl_schedule_node *graft; - - ctx = isl_schedule_node_get_ctx(node); - - space = isl_space_set_alloc(ctx, 0, 0); - space = isl_space_set_tuple_name(space, isl_dim_set, "init_device"); - domain = isl_union_set_from_set(isl_set_universe(space)); - graft = isl_schedule_node_from_domain(domain); - - node = isl_schedule_node_graft_before(node, graft); - - space = isl_space_set_alloc(ctx, 0, 0); - space = isl_space_set_tuple_name(space, isl_dim_set, "clear_device"); - domain = isl_union_set_from_set(isl_set_universe(space)); - graft = isl_schedule_node_from_domain(domain); - - node = isl_schedule_node_graft_after(node, graft); - - return node; -} - -/* Update "schedule" for mapping to a GPU device. - * - * In particular, insert a context node, create kernels for - * each outermost tilable band and introduce nodes for copying arrays - * in and out of the device and for initializing and clearing the device. - * If the child of the initial root points to a set node, - * then children of this node that do not contain any tilable bands - * are separated from the other children and are not mapped to - * the device. - * - * The GPU code is generated in a context where at least one - * statement instance is executed. The corresponding guard is inserted - * around the entire schedule. 
- */ -__isl_give isl_schedule *map_to_device(struct gpu_gen *gen, - __isl_take isl_schedule *schedule, int to_from_device) -{ - isl_schedule_node *node; - isl_set *context; - isl_set *guard; - isl_union_set *domain; - isl_union_map *prefix; - isl_union_pw_multi_aff *contraction; - struct gpu_prog *prog; - - context = isl_set_copy(gen->prog->context); - context = isl_set_from_params(context); - schedule = isl_schedule_insert_context(schedule, context); - - prog = gen->prog; - guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain)); - prog->context = isl_set_intersect(prog->context, isl_set_copy(guard)); - guard = isl_set_from_params(guard); - - node = isl_schedule_get_root(schedule); - isl_schedule_free(schedule); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_child(node, 0); - node = isolate_permutable_subtrees(node, gen->prog); - domain = isl_schedule_node_get_domain(node); - contraction = isl_schedule_node_get_subtree_contraction(node); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - isl_union_pw_multi_aff_copy(contraction)); - prefix = isl_schedule_node_get_prefix_schedule_union_map(node); - prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, - contraction); - node = mark_kernels(gen, node); - if (to_from_device) { - node = add_to_from_device(node, domain, prefix, gen->prog); - } else { - isl_union_set_free(domain); - isl_union_map_free(prefix); - } - node = isl_schedule_node_root(node); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_insert_guard(node, guard); - node = isl_schedule_node_child(node, 0); - node = add_init_clear_device(node); - schedule = isl_schedule_node_get_schedule(node); - isl_schedule_node_free(node); - - return schedule; -} - -/* Internal data structure for extract_access. - * "next_access" points to the end of a linked list that is extended - * by extract_access. 
- * "single_expression" is set if the access expressions belong to - * an expression statement (i.e., a statement without internal control). - * "any_to_outer" maps all intermediate arrays to their outer arrays. - */ -struct ppcg_extract_access_data { - struct gpu_stmt_access **next_access; - int single_expression; - isl_union_map *any_to_outer; -}; - -/* Given a tagged access relation to a single array "tagged", extract it - * as a map, taking into account that the input may be empty. - * If the access relation is empty, then it does not contain - * any space information, so we try to recover it from the index - * expression. - * The space of the index expression is of the form I -> A, - * with I the statement instances and A the array, or [I -> F] -> A, - * with F the filters corresponding to arguments. - * We first drop F, if present, obtaining I -> A. - * Then we construct I -> R, with R the reference tag, - * combine the two into I -> [R -> A] and uncurry to obtain - * the final result [I -> R] -> A. - * Note that the index expression may have a lower dimension - * than that of the array, but this dimension is not used - * if the access relation is empty. 
- */ -static __isl_give isl_map *extract_single_tagged_access( - __isl_take isl_union_map *tagged, __isl_keep pet_expr *expr) -{ - int empty; - isl_id *id; - isl_space *space, *space2; - isl_multi_pw_aff *index; - - empty = isl_union_map_is_empty(tagged); - if (empty < 0) - goto error; - if (!empty) - return isl_map_from_union_map(tagged); - isl_union_map_free(tagged); - - index = pet_expr_access_get_index(expr); - space = isl_multi_pw_aff_get_space(index); - isl_multi_pw_aff_free(index); - if (isl_space_domain_is_wrapping(space)) - space = isl_space_domain_factor_domain(space); - space2 = isl_space_copy(space); - space2 = isl_space_from_domain(isl_space_domain(space)); - id = pet_expr_access_get_ref_id(expr); - space2 = isl_space_set_tuple_id(space2, isl_dim_out, id); - space = isl_space_range_product(space2, space); - space = isl_space_uncurry(space); - - return isl_map_empty(space); -error: - isl_union_map_free(tagged); - return NULL; -} - -/* Does the index expression "index" of "expr" represent an access - * to a single element? - * That is, is "index" completely specified? - * - * If "expr" accesses elements from different spaces (i.e., fields - * of a structure), then it does not access a single element. - * Otherwise, if the single space of the access matches the space - * of "index", then the index expression is completely specified - * (no pointer to a lower-dimensional slice of the accessed array) - * and a single element is being accessed. 
- */ -static isl_bool complete_index(__isl_keep pet_expr *expr, - __isl_keep isl_multi_pw_aff *index) -{ - isl_union_map *read, *write, *all; - isl_map *map; - isl_space *space1, *space2; - isl_bool complete; - - read = pet_expr_access_get_may_read(expr); - write = pet_expr_access_get_may_write(expr); - all = isl_union_map_union(read, write); - if (!all) - return isl_bool_error; - if (isl_union_map_n_map(all) != 1) { - isl_union_map_free(all); - return isl_bool_false; - } - map = isl_map_from_union_map(all); - space1 = isl_map_get_space(map); - isl_map_free(map); - space2 = isl_multi_pw_aff_get_space(index); - complete = isl_space_tuple_is_equal(space1, isl_dim_out, - space2, isl_dim_out); - isl_space_free(space1); - isl_space_free(space2); - - return complete; -} - -/* Does "expr" access a single, fixed element (independently of the statement - * instance)? - * That is, does it have a completely specified constant index expression? - * - * Note that it is not sufficient for the index expression to be - * piecewise constant. isl_multi_pw_aff_is_cst can therefore not be used. - */ -static isl_bool accesses_fixed_element(__isl_keep pet_expr *expr) -{ - int i, n; - isl_multi_pw_aff *index; - isl_bool fixed = isl_bool_true; - - index = pet_expr_access_get_index(expr); - if (index < 0) - return isl_bool_error; - n = isl_multi_pw_aff_dim(index, isl_dim_out); - for (i = 0; i < n; ++i) { - isl_pw_aff *pa; - - pa = isl_multi_pw_aff_get_pw_aff(index, 0); - fixed = isl_pw_aff_n_piece(pa) == 1; - if (fixed) - fixed = isl_pw_aff_is_cst(pa); - isl_pw_aff_free(pa); - if (fixed < 0 || !fixed) - break; - } - if (fixed >= 0 && fixed) - fixed = complete_index(expr, index); - isl_multi_pw_aff_free(index); - - return fixed; -} - -/* Extract a gpu_stmt_access from "expr", append it to the list - * that ends in *data->next_access and update the end of the list. 
- * If the access expression performs a write, then it is considered - * exact only if it appears in a single expression statement and - * if its may access relation is equal to its must access relation. - * - * The combined set of may accesses may be a union if member accesses - * are involved, but the entire set is derived from a single reference and - * therefore from a single index expression. These accesses therefore - * all map to the same outer array. - */ -static int extract_access(__isl_keep pet_expr *expr, void *user) -{ - struct ppcg_extract_access_data *data = user; - isl_union_map *tagged; - struct gpu_stmt_access *access; - isl_ctx *ctx = pet_expr_get_ctx(expr); - isl_multi_pw_aff *index; - - access = isl_alloc_type(ctx, struct gpu_stmt_access); - assert(access); - access->next = NULL; - access->read = pet_expr_access_is_read(expr); - access->write = pet_expr_access_is_write(expr); - tagged = pet_expr_access_get_tagged_may_read(expr); - tagged = isl_union_map_union(tagged, - pet_expr_access_get_tagged_may_write(expr)); - tagged = isl_union_map_apply_range(tagged, - isl_union_map_copy(data->any_to_outer)); - if (!access->write) { - access->exact_write = 1; - } else if (!data->single_expression) { - access->exact_write = 0; - } else { - isl_union_map *must, *may; - may = isl_union_map_copy(tagged); - may = isl_union_map_domain_factor_domain(may); - must = pet_expr_access_get_must_write(expr); - access->exact_write = isl_union_map_is_equal(must, may); - isl_union_map_free(must); - isl_union_map_free(may); - } - index = pet_expr_access_get_index(expr); - access->n_index = isl_multi_pw_aff_dim(index, isl_dim_out); - isl_multi_pw_aff_free(index); - access->ref_id = pet_expr_access_get_ref_id(expr); - access->tagged_access = extract_single_tagged_access(tagged, expr); - access->access = isl_map_copy(access->tagged_access); - access->access = isl_map_domain_factor_domain(access->access); - access->fixed_element = accesses_fixed_element(expr); - - 
*data->next_access = access; - data->next_access = &(*data->next_access)->next; - - if (!access->access || access->fixed_element < 0) - return -1; - - return 0; -} - -/* Construct a linked list of gpu_stmt_access objects, - * one for each access expression in the statement body. - * "any_to_outer" maps all intermediate arrays to their outer arrays. - */ -static int pet_stmt_extract_accesses(struct gpu_stmt *stmt, - __isl_keep isl_union_map *any_to_outer) -{ - struct ppcg_extract_access_data data; - - stmt->accesses = NULL; - data.next_access = &stmt->accesses; - data.single_expression = - pet_tree_get_type(stmt->stmt->body) == pet_tree_expr; - data.any_to_outer = any_to_outer; - return pet_tree_foreach_access_expr(stmt->stmt->body, - &extract_access, &data); -} - -/* Has statement "stmt" been killed from "scop"? - * That is, is the instance set of "scop" free from any - * instances of "stmt"? - */ -static isl_bool is_stmt_killed(struct ppcg_scop *scop, struct pet_stmt *stmt) -{ - isl_space *space; - isl_set *left; - isl_bool empty; - - if (!scop || !stmt) - return isl_bool_error; - space = isl_set_get_space(stmt->domain); - left = isl_union_set_extract_set(scop->domain, space); - empty = isl_set_plain_is_empty(left); - isl_set_free(left); - - return empty; -} - -/* Return an array of gpu_stmt representing the statements in "scop". - * Do not collect array accesses for statements that have been killed. 
- */ -static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop, - __isl_keep isl_union_map *any_to_outer) -{ - int i; - struct gpu_stmt *stmts; - - stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->pet->n_stmt); - if (!stmts) - return NULL; - - for (i = 0; i < scop->pet->n_stmt; ++i) { - struct gpu_stmt *s = &stmts[i]; - isl_bool killed; - - s->id = isl_set_get_tuple_id(scop->pet->stmts[i]->domain); - s->stmt = scop->pet->stmts[i]; - killed = is_stmt_killed(scop, scop->pet->stmts[i]); - if (killed < 0) - return free_stmts(stmts, i + 1); - if (killed) - continue; - if (pet_stmt_extract_accesses(s, any_to_outer) < 0) - return free_stmts(stmts, i + 1); - } - - return stmts; -} - -/* Generate CUDA code for "scop" and print it to "p". - * After generating an AST for the transformed scop as explained below, - * we call "gen->print" to print the AST in the desired output format - * to "p". - * - * If it turns out that it does not make sense to generate GPU code, - * then we generate CPU code instead. - * - * The declarations of the arrays that are visible outside of the scop - * are printed outside of the code generated from the schedule, - * because the generated code may involve a guard around the entire code. - * - * We first compute a schedule that respects the dependences - * of the original program and select the outermost bands - * of tilable dimensions that have at least one parallel loop. - * If the --load-schedule is specified, then the loaded schedule - * is used instead of a computed schedule. 
- * - * Each of these bands B is then tiled according to "tile" sizes, resulting - * in two nested bands, with a kernel marker on top - * - * K - * | - * T - * | - * P - * - * We then split off at most 2 parallel dimensions from the T band and - * at most 3 parallel dimension from the P band - * - * K - * | - * T - * T1 - * | - * T2 - * | - * P1 - * | - * P2 - * - * A filter is introduced in front of T1 that maps the domain instances - * to block identifiers. Similarly, a filter is introduced in front of P1 - * that maps the domain instances to thread identifiers. - * - * For each iteration of the T2 band and for each array, we compute - * the array elements accessed by that iteration, construct a rectangular - * box around it and shift it to the origin. The result is used - * as shared memory for the array. - * - * Copying and synchronization statements are added to this schedule tree. - * In principle, these are added in front of the P1 band, but some of - * them may get hoisted up to higher levels. - * - * The entire AST is then generated from the single resulting schedule tree. - * During the generation the subtrees at kernel nodes (K) are saved - * aside and replaced by kernel calls. The result is printed as host code - * while the saved subtrees are printed as device code. 
- */ -static __isl_give isl_printer *generate(__isl_take isl_printer *p, - struct gpu_gen *gen, struct ppcg_scop *scop, - struct ppcg_options *options) -{ - struct gpu_prog *prog; - isl_ctx *ctx; - isl_schedule *schedule; - int any_permutable; - - if (!scop) - return isl_printer_free(p); - - ctx = isl_printer_get_ctx(p); - prog = gpu_prog_alloc(ctx, scop); - if (!prog) - return isl_printer_free(p); - - gen->prog = prog; - schedule = get_schedule(gen); - - any_permutable = has_any_permutable_node(schedule); - if (any_permutable < 0 || !any_permutable) { - if (any_permutable < 0) - p = isl_printer_free(p); - else - p = print_cpu(p, scop, options); - isl_schedule_free(schedule); - } else { - const int create_to_from_device = 1; - schedule = map_to_device(gen, schedule, create_to_from_device); - gen->tree = generate_code(gen, schedule); - p = ppcg_set_macro_names(p); - p = ppcg_print_exposed_declarations(p, prog->scop); - p = gen->print(p, gen->prog, gen->tree, &gen->types, - gen->print_user); - isl_ast_node_free(gen->tree); - } - - gpu_prog_free(prog); - - return p; -} - -/* Wrapper around generate for use as a ppcg_transform callback. - */ -static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p, - struct ppcg_scop *scop, void *user) -{ - struct gpu_gen *gen = user; - - return generate(p, gen, scop, gen->options); -} - -/* Transform the code in the file called "input" by replacing - * all scops by corresponding GPU code and write the results to "out". 
- */ -int generate_gpu(isl_ctx *ctx, const char *input, FILE *out, - struct ppcg_options *options, - __isl_give isl_printer *(*print)(__isl_take isl_printer *p, - struct gpu_prog *prog, __isl_keep isl_ast_node *tree, - struct gpu_types *types, void *user), void *user) -{ - struct gpu_gen gen; - int r; - int i; - - gen.ctx = ctx; - gen.sizes = extract_sizes_from_str(ctx, options->sizes); - gen.options = options; - gen.kernel_id = 0; - gen.print = print; - gen.print_user = user; - gen.types.n = 0; - gen.types.name = NULL; - - if (options->debug->dump_sizes) { - isl_space *space = isl_space_params_alloc(ctx, 0); - gen.used_sizes = isl_union_map_empty(space); - } - - r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen); - - if (options->debug->dump_sizes) { - isl_union_map_dump(gen.used_sizes); - isl_union_map_free(gen.used_sizes); - } - - isl_union_map_free(gen.sizes); - for (i = 0; i < gen.types.n; ++i) - free(gen.types.name[i]); - free(gen.types.name); - - return r; -} - -/* Compute the set of inner array elements that may have their values - * preserved by "prog". In particular, collect the array elements of - * arrays that are not local to "prog" and remove those elements that - * are definitely killed or definitely written by "prog". 
- */ -__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog) -{ - int i; - isl_union_set *may_persist, *killed; - isl_union_map *must_kill; - - may_persist = isl_union_set_empty(isl_set_get_space(prog->context)); - for (i = 0; i < prog->n_array; ++i) { - isl_set *extent; - - if (prog->array[i].local) - continue; - - extent = isl_set_copy(prog->array[i].extent); - may_persist = isl_union_set_add_set(may_persist, extent); - } - - may_persist = isl_union_set_intersect_params(may_persist, - isl_set_copy(prog->context)); - may_persist = isl_union_set_apply(may_persist, - isl_union_map_copy(prog->to_inner)); - must_kill = isl_union_map_copy(prog->tagged_must_kill); - killed = isl_union_map_range(must_kill); - must_kill = isl_union_map_copy(prog->must_write); - killed = isl_union_set_union(killed, isl_union_map_range(must_kill)); - - may_persist = isl_union_set_subtract(may_persist, killed); - return may_persist; -} - -struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop) -{ - struct gpu_prog *prog; - isl_space *space; - isl_map *id; - - if (!scop) - return NULL; - - prog = isl_calloc_type(ctx, struct gpu_prog); - assert(prog); - - prog->ctx = ctx; - prog->scop = scop; - prog->context = isl_set_copy(scop->context); - prog->n_stmts = scop->pet->n_stmt; - prog->any_to_outer = pet_scop_compute_outer_to_any(scop->pet); - prog->any_to_outer = isl_union_map_reverse(prog->any_to_outer); - space = isl_union_map_get_space(prog->any_to_outer); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, 1); - space = isl_space_map_from_set(space); - id = isl_map_identity(space); - prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer, id); - prog->stmts = extract_stmts(ctx, scop, prog->any_to_outer); - prog->read = isl_union_map_copy(scop->reads); - prog->may_write = isl_union_map_copy(scop->may_writes); - prog->must_write = isl_union_map_copy(scop->must_writes); - prog->tagged_must_kill = 
isl_union_map_copy(scop->tagged_must_kills); - prog->to_inner = pet_scop_compute_outer_to_inner(scop->pet); - prog->to_outer = isl_union_map_copy(prog->to_inner); - prog->to_outer = isl_union_map_reverse(prog->to_outer); - - if (!prog->stmts) - return gpu_prog_free(prog); - - if (collect_array_info(prog) < 0) - return gpu_prog_free(prog); - prog->may_persist = compute_may_persist(prog); - - return prog; -} - -void *gpu_prog_free(struct gpu_prog *prog) -{ - if (!prog) - return NULL; - free_array_info(prog); - free_stmts(prog->stmts, prog->n_stmts); - isl_union_map_free(prog->any_to_outer); - isl_union_map_free(prog->to_outer); - isl_union_map_free(prog->to_inner); - isl_union_map_free(prog->read); - isl_union_map_free(prog->may_write); - isl_union_map_free(prog->must_write); - isl_union_map_free(prog->tagged_must_kill); - isl_union_map_free(prog->array_order); - isl_union_set_free(prog->may_persist); - isl_set_free(prog->context); - free(prog); - return NULL; -} diff --git a/polly/lib/External/ppcg/gpu_array_tile.h b/polly/lib/External/ppcg/gpu_array_tile.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_array_tile.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef GPU_ARRAY_TILE_H -#define GPU_ARRAY_TILE_H - -#include -#include -#include - -/* The fields stride and shift only contain valid information - * if shift != NULL. - * If so, they express that current index is such that if you add shift, - * then the result is always a multiple of stride. - * Let D represent the initial tile->depth dimensions of the computed schedule. - * The spaces of "lb" and "shift" are of the form - * - * D -> [b] - */ -struct gpu_array_bound { - isl_val *size; - isl_aff *lb; - - isl_val *stride; - isl_aff *shift; -}; - -/* A tile of an outer array. - * - * requires_unroll is set if the schedule dimensions that are mapped - * to threads need to be unrolled for this (private) tile to be used. - * - * "depth" reflects the number of schedule dimensions that affect the tile. 
- * The copying into and/or out of the tile is performed at that depth. - * - * n is the dimension of the array. - * bound is an array of size "n" representing the lower bound - * and size for each index. - * - * tiling maps a tile in the global array to the corresponding - * shared/private memory tile and is of the form - * - * { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] } - * - * where D represents the initial "depth" dimensions - * of the computed schedule. - */ -struct gpu_array_tile { - isl_ctx *ctx; - int requires_unroll; - int depth; - int n; - struct gpu_array_bound *bound; - isl_multi_aff *tiling; -}; - -struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index); -struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile); - -__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile); - -#endif diff --git a/polly/lib/External/ppcg/gpu_array_tile.c b/polly/lib/External/ppcg/gpu_array_tile.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_array_tile.c +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include - -#include "gpu_array_tile.h" - -struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile) -{ - int j; - - if (!tile) - return NULL; - - for (j = 0; j < tile->n; ++j) { - isl_val_free(tile->bound[j].size); - isl_val_free(tile->bound[j].stride); - isl_aff_free(tile->bound[j].lb); - isl_aff_free(tile->bound[j].shift); - } - free(tile->bound); - isl_multi_aff_free(tile->tiling); - free(tile); - - return NULL; -} - -/* Create a gpu_array_tile for an array of dimension "n_index". 
- */ -struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index) -{ - int i; - struct gpu_array_tile *tile; - - tile = isl_calloc_type(ctx, struct gpu_array_tile); - if (!tile) - return NULL; - - tile->ctx = ctx; - tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index); - if (!tile->bound) - return gpu_array_tile_free(tile); - - tile->n = n_index; - - for (i = 0; i < n_index; ++i) { - tile->bound[i].size = NULL; - tile->bound[i].lb = NULL; - tile->bound[i].stride = NULL; - tile->bound[i].shift = NULL; - } - - return tile; -} - -/* Compute the size of the tile specified by "tile" - * in number of elements and return the result. - */ -__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile) -{ - int i; - isl_val *size; - - if (!tile) - return NULL; - - size = isl_val_one(tile->ctx); - - for (i = 0; i < tile->n; ++i) - size = isl_val_mul(size, isl_val_copy(tile->bound[i].size)); - - return size; -} diff --git a/polly/lib/External/ppcg/gpu_group.h b/polly/lib/External/ppcg/gpu_group.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_group.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef GPU_GROUP_H -#define GPU_GROUP_H - -#include -#include "gpu.h" - -/* A group of array references in a kernel that should be handled together. - * If private_tile is not NULL, then it is mapped to registers. - * Otherwise, if shared_tile is not NULL, it is mapped to shared memory. - * Otherwise, it is accessed from global memory. - * Note that if both private_tile and shared_tile are set, then shared_tile - * is only used inside group_common_shared_memory_tile. - */ -struct gpu_array_ref_group { - /* The references in this group access this local array. */ - struct gpu_local_array_info *local_array; - /* This is the corresponding array. */ - struct gpu_array_info *array; - /* Position of this group in the list of reference groups of array. */ - int nr; - - /* The following fields are use during the construction of the groups. 
- * access is the combined access relation relative to the private - * memory tiling. In particular, the domain of the map corresponds - * to the first thread_depth dimensions of the kernel schedule. - * write is set if any access in the group is a write. - * exact_write is set if all writes are definite writes. - * slice is set if there is at least one access in the group - * that refers to more than one element - * "min_depth" is the minimum of the tile depths and thread_depth. - */ - isl_map *access; - int write; - int exact_write; - int slice; - int min_depth; - - /* The shared memory tile, NULL if none. */ - struct gpu_array_tile *shared_tile; - - /* The private memory tile, NULL if none. */ - struct gpu_array_tile *private_tile; - - /* References in this group; point to elements of a linked list. */ - int n_ref; - struct gpu_stmt_access **refs; -}; - -int gpu_group_references(struct ppcg_kernel *kernel, - __isl_keep isl_schedule_node *node); - -__isl_give isl_printer *gpu_array_ref_group_print_name( - struct gpu_array_ref_group *group, __isl_take isl_printer *p); -void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group); -__isl_give isl_union_map *gpu_array_ref_group_access_relation( - struct gpu_array_ref_group *group, int read, int write); -int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group); -enum ppcg_group_access_type gpu_array_ref_group_type( - struct gpu_array_ref_group *group); -struct gpu_array_tile *gpu_array_ref_group_tile( - struct gpu_array_ref_group *group); -struct gpu_array_ref_group *gpu_array_ref_group_free( - struct gpu_array_ref_group *group); - -#endif diff --git a/polly/lib/External/ppcg/gpu_group.c b/polly/lib/External/ppcg/gpu_group.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_group.c +++ /dev/null @@ -1,1828 +0,0 @@ -/* - * Copyright 2010-2011 INRIA Saclay - * Copyright 2012-2014 Ecole Normale Superieure - * Copyright 2015 Sven Verdoolaege - * - * Use of this software is 
governed by the MIT license - * - * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, - * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, - * 91893 Orsay, France - * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include -#include - -#include "gpu_array_tile.h" -#include "gpu_group.h" -#include "gpu_tree.h" -#include "schedule.h" - -/* Print the name of the local copy of a given group of array references. - */ -__isl_give isl_printer *gpu_array_ref_group_print_name( - struct gpu_array_ref_group *group, __isl_take isl_printer *p) -{ - int global = 0; - enum ppcg_group_access_type type; - - type = gpu_array_ref_group_type(group); - if (type == ppcg_access_private) - p = isl_printer_print_str(p, "private_"); - else if (type == ppcg_access_shared) - p = isl_printer_print_str(p, "shared_"); - else - global = 1; - p = isl_printer_print_str(p, group->array->name); - if (!global && group->local_array->n_group > 1) { - p = isl_printer_print_str(p, "_"); - p = isl_printer_print_int(p, group->nr); - } - - return p; -} - -/* Return the union of all read (read = 1) and/or write (write = 1) - * access relations in the group. - */ -__isl_give isl_union_map *gpu_array_ref_group_access_relation( - struct gpu_array_ref_group *group, int read, int write) -{ - int i; - isl_union_map *access; - - access = isl_union_map_empty(isl_map_get_space(group->access)); - for (i = 0; i < group->n_ref; ++i) { - isl_map *map_i; - - if (!((read && group->refs[i]->read) || - (write && group->refs[i]->write))) - continue; - map_i = isl_map_copy(group->refs[i]->access); - access = isl_union_map_union(access, - isl_union_map_from_map(map_i)); - } - - return access; -} - -/* Should this array reference group be mapped to private, shared or global - * memory? - * If we have computed both a private and a shared tile, then - * the tile with the smallest depth is used. If both have the same depth, - * then the private tile is used. 
- */ -enum ppcg_group_access_type gpu_array_ref_group_type( - struct gpu_array_ref_group *group) -{ - if (group->private_tile && group->shared_tile && - group->shared_tile->depth < group->private_tile->depth) - return ppcg_access_shared; - if (group->private_tile) - return ppcg_access_private; - if (group->shared_tile) - return ppcg_access_shared; - return ppcg_access_global; -} - - -/* Return the effective gpu_array_tile associated to "group" or - * NULL if there is no such gpu_array_tile. - */ -struct gpu_array_tile *gpu_array_ref_group_tile( - struct gpu_array_ref_group *group) -{ - switch (gpu_array_ref_group_type(group)) { - case ppcg_access_global: - return NULL; - case ppcg_access_shared: - return group->shared_tile; - case ppcg_access_private: - return group->private_tile; - } -} - -/* Does the tile associated to "group" require unrolling of the schedule - * dimensions mapped to threads? - * Note that this can only happen for private tiles. - */ -int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group) -{ - struct gpu_array_tile *tile; - - tile = gpu_array_ref_group_tile(group); - if (!tile) - return 0; - return tile->requires_unroll; -} - -/* Given a constraint - * - * a(p,i) + j = g f(e) - * - * or -a(p,i) - j = g f(e) if sign < 0, - * store a(p,i) in bound->shift and g (stride) in bound->stride. - * a(p,i) is assumed to be an expression in only the parameters - * and the input dimensions. 
- */ -static void extract_stride(__isl_keep isl_constraint *c, - struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign) -{ - int i; - isl_val *v; - isl_space *space; - unsigned nparam; - unsigned nvar; - isl_aff *aff; - - isl_val_free(bound->stride); - bound->stride = isl_val_copy(stride); - - space = isl_constraint_get_space(c); - space = isl_space_domain(space); - - nparam = isl_space_dim(space, isl_dim_param); - nvar = isl_space_dim(space, isl_dim_set); - - v = isl_constraint_get_constant_val(c); - if (sign < 0) - v = isl_val_neg(v); - aff = isl_aff_zero_on_domain(isl_local_space_from_space(space)); - aff = isl_aff_set_constant_val(aff, v); - - for (i = 0; i < nparam; ++i) { - if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1)) - continue; - v = isl_constraint_get_coefficient_val(c, isl_dim_param, i); - if (sign < 0) - v = isl_val_neg(v); - aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v); - } - - for (i = 0; i < nvar; ++i) { - if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1)) - continue; - v = isl_constraint_get_coefficient_val(c, isl_dim_in, i); - if (sign < 0) - v = isl_val_neg(v); - aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v); - } - - bound->shift = aff; -} - -/* Given an equality constraint of a map with a single output dimension j, - * check if the constraint is of the form - * - * a(p,i) + j = g f(e) - * - * with a(p,i) an expression in the parameters and input dimensions - * and f(e) an expression in the existentially quantified variables. - * If so, and if g is larger than any such g from a previously considered - * constraint, then call extract_stride to record the stride information - * in bound. 
- */ -static isl_stat check_stride_constraint(__isl_take isl_constraint *c, - void *user) -{ - int i; - isl_ctx *ctx; - isl_val *v; - unsigned n_div; - struct gpu_array_bound *bound = user; - - ctx = isl_constraint_get_ctx(c); - n_div = isl_constraint_dim(c, isl_dim_div); - v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0); - - if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) { - int s = isl_val_sgn(v); - isl_val *stride = isl_val_zero(ctx); - - isl_val_free(v); - for (i = 0; i < n_div; ++i) { - v = isl_constraint_get_coefficient_val(c, - isl_dim_div, i); - stride = isl_val_gcd(stride, v); - } - if (!isl_val_is_zero(stride) && - isl_val_gt(stride, bound->stride)) - extract_stride(c, bound, stride, s); - - isl_val_free(stride); - } else - isl_val_free(v); - - isl_constraint_free(c); - return isl_stat_ok; -} - -/* Given contraints on an array index i, check if we can find - * a shift a(p) and a stride g such that - * - * a(p) + i = 0 mod g - * - * If so, record the information in bound and apply the mapping - * i -> (i + a(p))/g to the array index in bounds and return - * the new constraints. - * If not, simply return the original constraints. - * - * If bounds is a subset of the space - * - * D -> i - * - * then the bound recorded in bound->shift is of the form - * - * D -> s(D) - * - * with s(D) equal to a(p) above. - * Next, we construct a mapping of the form - * - * [D -> i] -> [D -> (i + S(D))/g] - * - * This mapping is computed as follows. - * We first introduce "i" in the domain through precomposition - * with [D -> i] -> D obtaining - * - * [D -> i] -> s(D) - * - * Adding [D -> i] -> i produces - * - * [D -> i] -> i + s(D) - * - * and the domain product with [D -> i] -> D yields - * - * [D -> i] -> [D -> i + s(D)] - * - * Composition with [D -> i] -> [D -> i/g] gives the desired result. 
- */ -static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound, - __isl_take isl_basic_map *bounds) -{ - isl_space *space; - isl_basic_map *hull; - isl_basic_map *shift, *id, *bmap, *scale; - isl_basic_set *bset; - isl_aff *aff; - - bound->stride = NULL; - - hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds)); - - isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound); - - isl_basic_map_free(hull); - - if (!bound->stride) - return bounds; - - shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift)); - space = isl_basic_map_get_space(bounds); - bmap = isl_basic_map_domain_map(isl_basic_map_universe(space)); - shift = isl_basic_map_apply_range(bmap, shift); - space = isl_basic_map_get_space(bounds); - id = isl_basic_map_range_map(isl_basic_map_universe(space)); - shift = isl_basic_map_sum(id, shift); - space = isl_basic_map_get_space(bounds); - id = isl_basic_map_domain_map(isl_basic_map_universe(space)); - shift = isl_basic_map_range_product(id, shift); - - space = isl_space_domain(isl_basic_map_get_space(bounds)); - id = isl_basic_map_identity(isl_space_map_from_set(space)); - space = isl_space_range(isl_basic_map_get_space(bounds)); - aff = isl_aff_zero_on_domain(isl_local_space_from_space(space)); - aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1); - aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride)); - scale = isl_basic_map_from_aff(aff); - scale = isl_basic_map_product(id, scale); - - bmap = isl_basic_map_apply_range(shift, scale); - bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap); - bounds = isl_basic_set_unwrap(bset); - - return bounds; -} - -/* Data used in compute_array_dim_size and compute_size_in_direction. - * - * pos is the position of the variable representing the array index, - * i.e., the variable for which want to compute the size. This variable - * is also the last variable in the set. 
- */ -struct gpu_size_info { - isl_basic_set *bset; - struct gpu_array_bound *bound; - int pos; -}; - -/* Given a constraint from the basic set describing the bounds on - * an array index, check if it is a lower bound, say m i >= b(x), and, - * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant - * upper bound. If so, and if this bound is smaller than any bound - * derived from earlier constraints, set the size to this bound on - * the expression and the lower bound to ceil(b(x)/m). - */ -static isl_stat compute_size_in_direction(__isl_take isl_constraint *c, - void *user) -{ - struct gpu_size_info *size = user; - unsigned nparam; - unsigned n_div; - isl_val *v; - isl_aff *aff; - isl_aff *lb; - - nparam = isl_basic_set_dim(size->bset, isl_dim_param); - n_div = isl_constraint_dim(c, isl_dim_div); - - if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) || - !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) { - isl_constraint_free(c); - return isl_stat_ok; - } - - aff = isl_constraint_get_bound(c, isl_dim_set, size->pos); - aff = isl_aff_ceil(aff); - - lb = isl_aff_copy(aff); - - aff = isl_aff_neg(aff); - aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1); - - v = isl_basic_set_max_val(size->bset, aff); - isl_aff_free(aff); - - if (isl_val_is_int(v)) { - v = isl_val_add_ui(v, 1); - if (!size->bound->size || isl_val_lt(v, size->bound->size)) { - isl_val_free(size->bound->size); - size->bound->size = isl_val_copy(v); - lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1); - isl_aff_free(size->bound->lb); - size->bound->lb = isl_aff_copy(lb); - } - } - isl_val_free(v); - isl_aff_free(lb); - - isl_constraint_free(c); - - return isl_stat_ok; -} - -/* Given a basic map "bounds" that maps parameters and input dimensions - * to a single output dimension, look for an expression in the parameters - * and input dimensions such that the range of the output dimension shifted - * by this expression is a constant. 
- * - * In particular, we currently only consider lower bounds on the output - * dimension as candidate expressions. - */ -static int compute_array_dim_size(struct gpu_array_bound *bound, - __isl_take isl_basic_map *bounds) -{ - struct gpu_size_info size; - - bounds = isl_basic_map_detect_equalities(bounds); - bounds = check_stride(bound, bounds); - - bound->size = NULL; - bound->lb = NULL; - - size.bound = bound; - size.pos = isl_basic_map_dim(bounds, isl_dim_in); - size.bset = isl_basic_map_wrap(bounds); - size.bset = isl_basic_set_flatten(size.bset); - size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset)); - isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction, - &size); - isl_basic_set_free(size.bset); - - return bound->size ? 0 : -1; -} - -/* Check if we can find a memory tile for the given array - * based on the given accesses, and if so, put the results in "tile". - * - * We project the accesses on each index in turn and look for a parametric - * offset such that the size is constant. - * - * tile->depth is initialized to the input dimension of the computed bounds. - */ -static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile) -{ - int i; - - tile->depth = isl_map_dim(access, isl_dim_in); - - for (i = 0; i < tile->n; ++i) { - isl_map *access_i; - isl_basic_map *hull; - - access_i = isl_map_copy(access); - access_i = isl_map_project_out(access_i, isl_dim_out, 0, i); - access_i = isl_map_project_out(access_i, isl_dim_out, - 1, tile->n - (i + 1)); - access_i = isl_map_compute_divs(access_i); - hull = isl_map_simple_hull(access_i); - if (compute_array_dim_size(&tile->bound[i], hull) < 0) - return 0; - } - - return 1; -} - -/* Internal data structure for gpu_group_references. - * - * scop represents the input scop. - * kernel_depth is the schedule depth where the kernel launch will - * be introduced, i.e., it is the depth of the band that is mapped - * to blocks. 
- * shared_depth is the schedule depth at which the copying to/from - * shared memory is computed. The copy operation may then - * later be hoisted to a higher level. - * thread_depth is the schedule depth where the thread mark is located, - * i.e., it is the depth of the band that is mapped to threads and also - * the schedule depth at which the copying to/from private memory - * is computed. The copy operation may then later be hoisted to - * a higher level. - * n_thread is the number of schedule dimensions in the band that - * is mapped to threads. - * privatization lives in the range of thread_sched (i.e., it is - * of dimension thread_depth + n_thread) and encodes the mapping - * to thread identifiers (as parameters). - * host_sched contains the kernel_depth dimensions of the host schedule. - * shared_sched contains the first shared_depth dimensions of the - * kernel schedule. - * copy_sched contains the first thread_depth dimensions of the - * kernel schedule. - * thread_sched contains the first (thread_depth + n_thread) dimensions - * of the kernel schedule. - * full_sched is a union_map representation of the entire kernel schedule. - * The schedules are all formulated in terms of the original statement - * instances, i.e., those that appear in the domains of the access - * relations. - */ -struct gpu_group_data { - struct ppcg_scop *scop; - int kernel_depth; - int shared_depth; - int thread_depth; - int n_thread; - isl_set *privatization; - isl_union_map *host_sched; - isl_union_map *shared_sched; - isl_union_map *copy_sched; - isl_union_map *thread_sched; - isl_union_map *full_sched; -}; - -/* Construct a map from domain_space to domain_space that increments - * the dimension at position "pos" and leaves all other dimensions - * constant. 
- */ -static __isl_give isl_map *next(__isl_take isl_space *domain_space, int pos) -{ - isl_space *space; - isl_aff *aff; - isl_multi_aff *next; - - space = isl_space_map_from_set(domain_space); - next = isl_multi_aff_identity(space); - aff = isl_multi_aff_get_aff(next, pos); - aff = isl_aff_add_constant_si(aff, 1); - next = isl_multi_aff_set_aff(next, pos, aff); - - return isl_map_from_multi_aff(next); -} - -/* Check if the given access is coalesced (or if there is no point - * in trying to coalesce the access by mapping the array to shared memory). - * That is, check whether incrementing the dimension that will get - * wrapped over the last thread index results in incrementing - * the last array index. - * - * If no two consecutive array elements are ever accessed by "access", - * then mapping the corresponding array to shared memory will not - * improve coalescing. In fact, the copying will likely be performed - * by a single thread. Consider the access as coalesced such that - * the caller will not try and map the array to shared memory just - * to improve coalescing. - * - * This function is only called for access relations without reuse and - * kernels with at least one thread identifier. 
- */ -static int access_is_coalesced(struct gpu_group_data *data, - __isl_keep isl_union_map *access) -{ - int dim; - isl_space *space; - isl_set *accessed; - isl_map *access_map; - isl_map *next_thread_x; - isl_map *next_element; - isl_map *map; - int coalesced, empty; - - access = isl_union_map_copy(access); - access = isl_union_map_apply_domain(access, - isl_union_map_copy(data->full_sched)); - access_map = isl_map_from_union_map(access); - - space = isl_map_get_space(access_map); - space = isl_space_range(space); - dim = isl_space_dim(space, isl_dim_set); - if (dim == 0) - next_element = isl_map_empty(isl_space_map_from_set(space)); - else - next_element = next(space, dim - 1); - - accessed = isl_map_range(isl_map_copy(access_map)); - map = isl_map_copy(next_element); - map = isl_map_intersect_domain(map, isl_set_copy(accessed)); - map = isl_map_intersect_range(map, accessed); - empty = isl_map_is_empty(map); - isl_map_free(map); - - if (empty < 0 || empty) { - isl_map_free(next_element); - isl_map_free(access_map); - return empty; - } - - space = isl_map_get_space(access_map); - space = isl_space_domain(space); - next_thread_x = next(space, data->thread_depth + data->n_thread - 1); - - map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map)); - map = isl_map_apply_range(map, access_map); - - coalesced = isl_map_is_subset(map, next_element); - - isl_map_free(next_element); - isl_map_free(map); - - return coalesced; -} - -/* Replace the host schedule dimensions in the access relation "access" - * by parameters, so that they are treated as fixed when checking for reuse - * (within a kernel) or whether two consecutive elements are accessed - * (within a kernel). 
- */ -static __isl_give isl_union_map *localize_access(struct gpu_group_data *data, - __isl_take isl_union_map *access) -{ - int n; - isl_space *space; - isl_set *param; - isl_union_map *umap; - isl_id_list *ids; - - umap = isl_union_map_copy(data->host_sched); - space = isl_union_map_get_space(umap); - n = data->kernel_depth; - ids = ppcg_scop_generate_names(data->scop, n, "__ppcg_host_"); - param = parametrization(space, n, 0, ids); - isl_id_list_free(ids); - umap = isl_union_map_intersect_range(umap, - isl_union_set_from_set(param)); - access = isl_union_map_intersect_domain(access, - isl_union_map_domain(umap)); - - return access; -} - -/* Given an access relation in terms of at least data->thread_depth initial - * dimensions of the computed schedule, check if it is bijective for - * fixed values of the first data->thread_depth dimensions. - * We perform this check by equating these dimensions to parameters. - */ -static int access_is_bijective(struct gpu_group_data *data, - __isl_keep isl_map *access) -{ - int res; - int dim; - isl_set *par; - isl_space *space; - isl_id_list *ids; - - access = isl_map_copy(access); - space = isl_space_params(isl_map_get_space(access)); - ids = ppcg_scop_generate_names(data->scop, data->thread_depth, "s"); - dim = isl_map_dim(access, isl_dim_in); - par = parametrization(space, dim, 0, ids); - isl_id_list_free(ids); - access = isl_map_intersect_domain(access, par); - res = isl_map_is_bijective(access); - isl_map_free(access); - - return res; -} - -/* Compute the number of outer schedule tile dimensions that affect - * the offset of "tile". - * If there is no such dimension, then return the index - * of the first kernel dimension, i.e., data->kernel_depth. 
- */ -static int compute_tile_depth(struct gpu_group_data *data, - struct gpu_array_tile *tile) -{ - int i, j; - - for (j = tile->depth - 1; j >= data->kernel_depth; --j) { - for (i = 0; i < tile->n; ++i) { - isl_aff *lb; - isl_aff *shift; - - lb = tile->bound[i].lb; - if (isl_aff_involves_dims(lb, isl_dim_in, j, 1)) - break; - - shift = tile->bound[i].shift; - if (!shift) - continue; - if (isl_aff_involves_dims(shift, isl_dim_in, j, 1)) - break; - } - if (i < tile->n) - break; - } - - return ++j; -} - -/* Return the lowest depth between data->kernel_depth and data->thread_depth - * at which every array element accessed through "acc" is accessed - * by a single thread. The input dimension of "acc" is - * data->thread_depth + data->n_thread, where the final data->n_thread - * dimensions are those that will be mapped to threads. - * If the values for these dimensions are uniquely determined - * by the array index and a given number of outer dimensions, then - * there is only one thread accessing that array element within those - * outer dimensions. - * - * The input space of "acc" is first split up, such that it has the form - * - * [O -> T] -> A - * - * with O the outer dimensions, T the dimensions that will be mapped to threads - * and A the array index. - * - * Then the positions of T and A are interchanged to simplify the test - * whether T uniquely depends on O and A. - * In particular, the above access relation is first combined with - * - * [O -> T] -> T - * - * to form - * - * [O -> T] -> [A -> T] - * - * from which - * - * O -> [A -> T] - * - * is extracted, which is then uncurried to - * - * [O -> A] -> T - * - * Finally, the final dimensions of O are projected out one by one - * until T is no longer uniquely determined by A and the remaining - * dimensions in O. The value returned is that of the last dimension - * that was successfully projected out. 
- * Note that there is no need to test whether [O -> A] -> T itself - * is single-valued as that was already tested in access_is_bijective. - */ -static int compute_accessed_by_single_thread_depth(struct gpu_group_data *data, - __isl_keep isl_map *acc) -{ - int i; - isl_space *space; - isl_map *map; - isl_bool sv; - - if (data->thread_depth == data->kernel_depth) - return data->thread_depth; - - acc = isl_map_copy(acc); - - space = isl_map_get_space(acc); - space = isl_space_params(space); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, data->thread_depth); - space = isl_space_from_domain(space); - space = isl_space_add_dims(space, isl_dim_out, data->n_thread); - space = isl_space_wrap(space); - map = isl_set_flatten_map(isl_set_universe(space)); - acc = isl_map_apply_range(map, acc); - - space = isl_space_domain(isl_map_get_space(acc)); - map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space))); - acc = isl_map_range_product(acc, map); - acc = isl_map_domain_factor_domain(acc); - acc = isl_map_uncurry(acc); - - for (i = data->thread_depth - 1; i >= data->kernel_depth; --i) { - acc = isl_map_project_out(acc, isl_dim_in, i, 1); - sv = isl_map_is_single_valued(acc); - if (sv < 0) - return -1; - if (!sv) - break; - } - - isl_map_free(acc); - - return ++i; -} - -/* Adjust the fields of "tile" to reflect the new input dimension "depth". - * The dimension beyond "depth" are assumed not to affect the tile, - * so they can simply be dropped. 
- */ -static int tile_adjust_depth(struct gpu_array_tile *tile, int depth) -{ - int i; - - if (tile->depth == depth) - return 0; - - for (i = 0; i < tile->n; ++i) { - tile->bound[i].lb = isl_aff_drop_dims(tile->bound[i].lb, - isl_dim_in, depth, tile->depth - depth); - if (!tile->bound[i].lb) - return -1; - if (!tile->bound[i].shift) - continue; - tile->bound[i].shift = isl_aff_drop_dims(tile->bound[i].shift, - isl_dim_in, depth, tile->depth - depth); - if (!tile->bound[i].shift) - return -1; - } - - tile->depth = depth; - - return 0; -} - -/* Determine the number of schedule dimensions that affect the offset of the - * shared or private tile "tile" and store the result in tile->depth, with - * a lower bound of data->kernel_depth. - * Also adjust the fields of the tile to only refer to the tile->depth - * outer schedule dimensions. - */ -static isl_stat tile_set_depth(struct gpu_group_data *data, - struct gpu_array_tile *tile) -{ - if (tile_adjust_depth(tile, compute_tile_depth(data, tile)) < 0) - return isl_stat_error; - - return isl_stat_ok; -} - -/* Determine the number of schedule dimensions that affect the offset of the - * shared tile and store the minimum of the private and shared tile depth - * in group->min_depth, with a lower bound of data->kernel_depth. - * If there is no tile defined on the array reference group, - * then set group->min_depth to data->thread_depth. 
- */ -static int set_depth(struct gpu_group_data *data, - struct gpu_array_ref_group *group) -{ - group->min_depth = data->thread_depth; - - if (group->private_tile) { - if (group->private_tile->depth < group->min_depth) - group->min_depth = group->private_tile->depth; - } - if (group->shared_tile) { - if (tile_set_depth(data, group->shared_tile) < 0) - return -1; - if (group->shared_tile->depth < group->min_depth) - group->min_depth = group->shared_tile->depth; - } - - return 0; -} - -/* Fill up the groups array with singleton groups, i.e., one group - * per reference, initializing the array, access, write, n_ref and refs fields. - * In particular the access field is initialized to the scheduled - * access relation of the array reference. - * - * Return the number of elements initialized, i.e., the number of - * active references in the current kernel. - */ -static int populate_array_references(struct gpu_local_array_info *local, - struct gpu_array_ref_group **groups, struct gpu_group_data *data) -{ - int i; - int n; - isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched); - - n = 0; - for (i = 0; i < local->array->n_ref; ++i) { - isl_union_map *umap; - isl_map *map; - struct gpu_array_ref_group *group; - struct gpu_stmt_access *access = local->array->refs[i]; - - map = isl_map_copy(access->access); - umap = isl_union_map_from_map(map); - umap = isl_union_map_apply_domain(umap, - isl_union_map_copy(data->copy_sched)); - - if (isl_union_map_is_empty(umap)) { - isl_union_map_free(umap); - continue; - } - - map = isl_map_from_union_map(umap); - map = isl_map_detect_equalities(map); - - group = isl_calloc_type(ctx, struct gpu_array_ref_group); - if (!group) - return -1; - group->local_array = local; - group->array = local->array; - group->access = map; - group->write = access->write; - group->exact_write = access->exact_write; - group->slice = access->n_index < local->array->n_index; - group->refs = &local->array->refs[i]; - group->n_ref = 1; - - groups[n++] = group; 
- } - - return n; -} - -/* If group->n_ref == 1, then group->refs was set by - * populate_array_references to point directly into - * group->array->refs and should not be freed. - * If group->n_ref > 1, then group->refs was set by join_groups - * to point to a newly allocated array. - */ -struct gpu_array_ref_group *gpu_array_ref_group_free( - struct gpu_array_ref_group *group) -{ - if (!group) - return NULL; - gpu_array_tile_free(group->shared_tile); - gpu_array_tile_free(group->private_tile); - isl_map_free(group->access); - if (group->n_ref > 1) - free(group->refs); - free(group); - return NULL; -} - -/* Check if the access relations of group1 and group2 overlap within - * copy_sched. - */ -static int accesses_overlap(struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2) -{ - int disjoint; - - disjoint = isl_map_is_disjoint(group1->access, group2->access); - if (disjoint < 0) - return -1; - - return !disjoint; -} - -/* Combine the given two groups into a single group, containing - * the references of both groups. 
- */ -static struct gpu_array_ref_group *join_groups( - struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2) -{ - int i; - isl_ctx *ctx; - struct gpu_array_ref_group *group; - - if (!group1 || !group2) - return NULL; - - ctx = isl_map_get_ctx(group1->access); - group = isl_calloc_type(ctx, struct gpu_array_ref_group); - if (!group) - return NULL; - group->local_array = group1->local_array; - group->array = group1->array; - group->access = isl_map_union(isl_map_copy(group1->access), - isl_map_copy(group2->access)); - group->write = group1->write || group2->write; - group->exact_write = group1->exact_write && group2->exact_write; - group->slice = group1->slice || group2->slice; - group->n_ref = group1->n_ref + group2->n_ref; - group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *, - group->n_ref); - if (!group->refs) - return gpu_array_ref_group_free(group); - for (i = 0; i < group1->n_ref; ++i) - group->refs[i] = group1->refs[i]; - for (i = 0; i < group2->n_ref; ++i) - group->refs[group1->n_ref + i] = group2->refs[i]; - - return group; -} - -/* Combine the given two groups into a single group and free - * the original two groups. - */ -static struct gpu_array_ref_group *join_groups_and_free( - struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2) -{ - struct gpu_array_ref_group *group; - - group = join_groups(group1, group2); - gpu_array_ref_group_free(group1); - gpu_array_ref_group_free(group2); - return group; -} - -/* Report that the array reference group with the given access relation - * is not mapped to shared memory in the given kernel because - * it does not exhibit any reuse and is considered to be coalesced. 
- */ -static void report_no_reuse_and_coalesced(struct ppcg_kernel *kernel, - __isl_keep isl_union_map *access) -{ - isl_ctx *ctx; - isl_printer *p; - - ctx = isl_union_map_get_ctx(access); - p = isl_printer_to_file(ctx, stdout); - p = isl_printer_print_str(p, "Array reference group "); - p = isl_printer_print_union_map(p, access); - p = isl_printer_print_str(p, - " not considered for mapping to shared memory in kernel"); - p = isl_printer_print_int(p, kernel->id); - p = isl_printer_print_str(p, - " because it exhibits no reuse and is considered to be coalesced"); - p = isl_printer_end_line(p); - isl_printer_free(p); -} - -/* Given an access relation in terms of the data->thread_depth initial - * dimensions of the computed schedule and the thread identifiers - * (as parameters), check if the use of the corresponding private tile - * requires unrolling. - * - * If we are creating a private tile because we are forced to, - * then no unrolling is required. - * Otherwise we check if "access" is bijective and unrolling - * is required if it is not. Note that the access relation - * has already been determined to be bijective before the introduction - * of the thread identifiers and the removal of the schedule dimensions - * that are mapped to these threads. If the access relation is no longer - * bijective, then this means that more than one value of one of those - * schedule dimensions is mapped to the same thread and therefore - * unrolling is required. - */ -static int check_requires_unroll(struct gpu_group_data *data, - __isl_keep isl_map *access, int force_private) -{ - int bijective; - - if (force_private) - return 0; - bijective = access_is_bijective(data, access); - if (bijective < 0) - return -1; - return !bijective; -} - -/* Map the domain of "access" to the outer data->shared_depth - * schedule dimensions. When data->shared_depth is equal to - * data->thread_depth, this result is already available in group->access. 
- */ -static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group, - __isl_keep isl_union_map *access, struct gpu_group_data *data) -{ - isl_union_map *shared; - - if (data->shared_depth == data->thread_depth) - return isl_map_copy(group->access); - - shared = isl_union_map_copy(access); - shared = isl_union_map_apply_domain(shared, - isl_union_map_copy(data->shared_sched)); - return isl_map_from_union_map(shared); -} - -/* Compute the private and/or shared memory tiles for the array - * reference group "group" of array "array". - * Return 0 on success and -1 on error. - * - * If the array is a read-only scalar or if the user requested - * not to use shared or private memory, then we do not need to do anything. - * - * If any reference in the reference group accesses more than one element, - * then we would have to make sure that the layout in shared memory - * is the same as that in global memory. Since we do not handle this yet - * (and it may not even be possible), we refuse to map to private or - * shared memory in such cases. - * - * If the array group involves any may writes (that are not must writes), - * then we would have to make sure that we load the data into shared/private - * memory first in case the data is not written by the kernel - * (but still written back out to global memory). - * Since we don't have any such mechanism at the moment, we don't - * compute shared/private tiles for groups involving may writes. - * - * We only try to compute a shared memory tile if there is any reuse - * or if the access is not coalesced. - * Reuse and coalescing are checked within the given kernel. - * - * For computing a private memory tile, we also require that there is - * some reuse. Moreover, we require that the access is private - * to the thread. That is, we check that any given array element - * is only accessed by a single thread. 
- * We compute an access relation that maps the outer - * data->thread_depth + data->n_thread schedule dimensions. - * The latter data->n_thread will be mapped to thread identifiers. - * We actually check that those iterators that will be wrapped - * partition the array space. This check is stricter than necessary - * since several iterations may be mapped onto the same thread - * and then they could be allowed to access the same memory elements, - * but our check does not allow this situation. - * - * For private memory tiles, the number of schedule dimensions that - * affect the offset is computed and stored in tile->depth, with - * a lower bound of data->kernel_depth. If this depth is smaller - * than the minimal depth that still ensures that every element - * is accessed by a single thread, then the depth is raised - * to this minimal depth. - * The fields of the tile are then adjusted to only refer to the tile->depth - * outer schedule dimensions. - * - * We also check that the index expression only depends on parallel - * loops. That way, we can move those loops innermost and unroll them. - * Again, we use a test that is stricter than necessary. - * We actually check whether the index expression only depends - * on the iterators that are wrapped over the threads. - * These are necessarily parallel, but there may be more parallel loops. - * - * Combining the injectivity of the first test with the single-valuedness - * of the second test, we simply test for bijectivity. - * - * If the use of the private tile requires unrolling, but some - * of the other arrays are forcibly mapped to private memory, - * then we do not allow the use of this private tile since - * we cannot move the schedule dimensions that need to be unrolled down - * without performing some kind of expansion on those arrays - * that are forcibly mapped to private memory. - * - * If the array is marked force_private, then we bypass all checks - * and assume we can (and should) use registers only. 
- * - * If it turns out we can (or have to) use registers, we compute - * the private memory tile size using can_tile, after introducing a dependence - * on the thread indices. - */ -static int compute_group_bounds_core(struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group, struct gpu_group_data *data) -{ - isl_ctx *ctx = isl_space_get_ctx(group->array->space); - isl_union_map *access, *local; - int n_index = group->array->n_index; - int no_reuse, coalesced; - isl_map *acc; - int force_private = group->local_array->force_private; - int use_shared = !force_private && kernel->options->use_shared_memory && - data->n_thread > 0; - int use_private = force_private || kernel->options->use_private_memory; - int r = 0; - int requires_unroll; - int unique_depth; - - if (!use_shared && !use_private) - return 0; - if (gpu_array_is_read_only_scalar(group->array)) - return 0; - if (!force_private && !group->exact_write) - return 0; - if (group->slice) - return 0; - - access = gpu_array_ref_group_access_relation(group, 1, 1); - local = localize_access(data, isl_union_map_copy(access)); - no_reuse = isl_union_map_is_injective(local); - if (no_reuse < 0) - r = -1; - if (use_shared && no_reuse) - coalesced = access_is_coalesced(data, local); - isl_union_map_free(local); - - if (r >= 0 && kernel->options->debug->verbose && - use_shared && no_reuse && coalesced) - report_no_reuse_and_coalesced(kernel, access); - - if (use_shared && (!no_reuse || !coalesced)) { - group->shared_tile = gpu_array_tile_create(ctx, - group->array->n_index); - acc = shared_access(group, access, data); - if (!group->shared_tile) - r = -1; - else if (!can_tile(acc, group->shared_tile)) - group->shared_tile = - gpu_array_tile_free(group->shared_tile); - isl_map_free(acc); - } - - if (r < 0 || (!force_private && (!use_private || no_reuse))) { - isl_union_map_free(access); - return r; - } - - access = isl_union_map_apply_domain(access, - isl_union_map_copy(data->thread_sched)); - - acc = 
isl_map_from_union_map(access); - - if (!force_private && !access_is_bijective(data, acc)) { - isl_map_free(acc); - return 0; - } - - unique_depth = compute_accessed_by_single_thread_depth(data, acc); - - acc = isl_map_intersect_domain(acc, isl_set_copy(data->privatization)); - acc = isl_map_project_out(acc, isl_dim_in, data->thread_depth, - data->n_thread); - requires_unroll = check_requires_unroll(data, acc, force_private); - if (unique_depth < 0 || requires_unroll < 0 || - (requires_unroll && kernel->any_force_private)) { - isl_map_free(acc); - return requires_unroll < 0 ? -1 : 0; - } - - group->private_tile = gpu_array_tile_create(ctx, n_index); - if (!group->private_tile) { - isl_map_free(acc); - return -1; - } - group->private_tile->requires_unroll = requires_unroll; - if (!can_tile(acc, group->private_tile)) - group->private_tile = gpu_array_tile_free(group->private_tile); - - isl_map_free(acc); - - if (group->private_tile) { - struct gpu_array_tile *tile = group->private_tile; - int tile_depth = compute_tile_depth(data, tile); - if (tile_depth < unique_depth) - tile_depth = unique_depth; - if (tile_adjust_depth(tile, tile_depth) < 0) - return -1; - } - - if (force_private && !group->private_tile) - isl_die(ctx, isl_error_internal, - "unable to map array reference group to registers", - return -1); - - return 0; -} - -/* Compute the private and/or shared memory tiles for the array - * reference group "group" of array "array" and set the tile depth. - * Return 0 on success and -1 on error. - */ -static int compute_group_bounds(struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group, struct gpu_group_data *data) -{ - if (!group) - return -1; - if (compute_group_bounds_core(kernel, group, data) < 0) - return -1; - if (set_depth(data, group) < 0) - return -1; - - return 0; -} - -/* If two groups have overlapping access relations (as determined by - * the "overlap" function) and if one of them involves a write, - * then merge the two groups into one. 
- * If "compute_bounds" is set, then call compute_group_bounds - * on the merged groups. - * - * Return the updated number of groups. - * Return -1 on error. - */ -static int group_writes(struct ppcg_kernel *kernel, - int n, struct gpu_array_ref_group **groups, - int (*overlap)(struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2), int compute_bounds, - struct gpu_group_data *data) -{ - int i, j; - - for (i = 0; i < n; ++i) { - for (j = n - 1; j > i; --j) { - if (!groups[i]->write && !groups[j]->write) - continue; - - if (!overlap(groups[i], groups[j])) - continue; - - groups[i] = join_groups_and_free(groups[i], groups[j]); - if (j != n - 1) - groups[j] = groups[n - 1]; - groups[n - 1] = NULL; - n--; - - if (!groups[i]) - return -1; - if (compute_bounds && - compute_group_bounds(kernel, groups[i], data) < 0) - return -1; - } - } - - return n; -} - -/* If two groups have overlapping access relations (within the innermost - * loop) and if one of them involves a write, then merge the two groups - * into one. - * - * Return the updated number of groups. - */ -static int group_overlapping_writes(struct ppcg_kernel *kernel, - int n, struct gpu_array_ref_group **groups, - struct gpu_group_data *data) -{ - return group_writes(kernel, n, groups, &accesses_overlap, 0, data); -} - -/* Check if the access relations of group1 and group2 overlap within - * the outermost min(group1->min_depth, group2->min_depth) loops. 
- */ -static int depth_accesses_overlap(struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2) -{ - int depth; - int dim; - int empty; - isl_map *map_i, *map_j, *map; - - depth = group1->min_depth; - if (group2->min_depth < depth) - depth = group2->min_depth; - map_i = isl_map_copy(group1->access); - dim = isl_map_dim(map_i, isl_dim_in); - map_i = isl_map_eliminate(map_i, isl_dim_in, depth, dim - depth); - map_j = isl_map_copy(group2->access); - map_j = isl_map_eliminate(map_j, isl_dim_in, depth, dim - depth); - map = isl_map_intersect(map_i, map_j); - empty = isl_map_is_empty(map); - isl_map_free(map); - - return !empty; -} - -/* If two groups have overlapping access relations (within the outer - * depth loops) and if one of them involves a write, - * then merge the two groups into one. - * - * Return the updated number of groups. - */ -static int group_depth_overlapping_writes(struct ppcg_kernel *kernel, - int n, struct gpu_array_ref_group **groups, struct gpu_group_data *data) -{ - return group_writes(kernel, n, groups, &depth_accesses_overlap, 1, - data); -} - -/* Is the size of the tile specified by "tile" smaller than the sum of - * the sizes of the tiles specified by "tile1" and "tile2"? 
- */ -static int smaller_tile(struct gpu_array_tile *tile, - struct gpu_array_tile *tile1, struct gpu_array_tile *tile2) -{ - int smaller; - isl_val *size, *size1, *size2; - - size = gpu_array_tile_size(tile); - size1 = gpu_array_tile_size(tile1); - size2 = gpu_array_tile_size(tile2); - - size = isl_val_sub(size, size1); - size = isl_val_sub(size, size2); - smaller = isl_val_is_neg(size); - - isl_val_free(size); - - return smaller; -} - -/* Given an initial grouping of array references and shared memory tiles - * for each group that allows for a shared memory tile, merge two groups - * if both have a shared memory tile, the merged group also has - * a shared memory tile and the size of the tile for the merge group - * is smaller than the sum of the tile sizes of the individual groups. - * - * If merging two groups decreases the depth of the tile of - * one or both of the two groups, then we need to check for overlapping - * writes again. - * - * Return the number of groups after merging. - * Return -1 on error. 
- */ -static int group_common_shared_memory_tile(struct ppcg_kernel *kernel, - struct gpu_array_info *array, int n, - struct gpu_array_ref_group **groups, struct gpu_group_data *data) -{ - int i, j; - int recompute_overlap = 0; - - for (i = 0; i < n; ++i) { - if (!groups[i]->shared_tile) - continue; - for (j = n - 1; j > i; --j) { - struct gpu_array_ref_group *group; - - if (!groups[j]->shared_tile) - continue; - - if (!depth_accesses_overlap(groups[i], groups[j])) - continue; - - group = join_groups(groups[i], groups[j]); - if (compute_group_bounds(kernel, group, data) < 0) { - gpu_array_ref_group_free(group); - return -1; - } - if (!group->shared_tile || - !smaller_tile(group->shared_tile, - groups[i]->shared_tile, - groups[j]->shared_tile)) { - gpu_array_ref_group_free(group); - continue; - } - - if (group->min_depth < groups[i]->min_depth || - group->min_depth < groups[j]->min_depth) - recompute_overlap = 1; - gpu_array_ref_group_free(groups[i]); - gpu_array_ref_group_free(groups[j]); - groups[i] = group; - if (j != n - 1) - groups[j] = groups[n - 1]; - n--; - } - } - - if (recompute_overlap) - n = group_depth_overlapping_writes(kernel, n, groups, data); - return n; -} - -/* Set array->n_group and array->groups to n and groups. - * - * Additionally, set the "nr" field of each group. - */ -static void set_array_groups(struct gpu_local_array_info *array, - int n, struct gpu_array_ref_group **groups) -{ - int i; - - array->n_group = n; - array->groups = groups; - - for (i = 0; i < n; ++i) - groups[i]->nr = i; -} - -/* Combine all groups in "groups" into a single group and return - * the new number of groups (1 or 0 if there were no groups to start with). 
- */ -static int join_all_groups(int n, struct gpu_array_ref_group **groups) -{ - int i; - - for (i = n - 1; i > 0; --i) { - groups[0] = join_groups_and_free(groups[0], groups[i]); - groups[i] = NULL; - n--; - } - - return n; -} - -/* Group array references that should be considered together when - * deciding whether to access them from private, shared or global memory. - * Return -1 on error. - * - * In particular, if two array references overlap and if one of them - * is a write, then the two references are grouped together. - * We first perform an initial grouping based only on the access relation. - * After computing shared and private memory tiles, we check for - * overlapping writes again, but this time taking into account - * the depth of the effective tile. - * - * Furthermore, if two groups admit a shared memory tile and if the - * combination of the two also admits a shared memory tile, we merge - * the two groups. - * - * If the array contains structures, then we compute a single - * reference group without trying to find any tiles - * since we do not map such arrays to private or shared - * memory. The only exception is when those arrays of structures - * are required to be mapped to private memory. 
- */ -static int group_array_references(struct ppcg_kernel *kernel, - struct gpu_local_array_info *local, struct gpu_group_data *data) -{ - int i; - int n; - isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched); - struct gpu_array_ref_group **groups; - - groups = isl_calloc_array(ctx, struct gpu_array_ref_group *, - local->array->n_ref); - if (!groups) - return -1; - - n = populate_array_references(local, groups, data); - - if (local->array->has_compound_element && !local->force_private) { - n = join_all_groups(n, groups); - set_array_groups(local, n, groups); - return 0; - } - - n = group_overlapping_writes(kernel, n, groups, data); - - for (i = 0; i < n; ++i) - if (compute_group_bounds(kernel, groups[i], data) < 0) - n = -1; - - n = group_depth_overlapping_writes(kernel, n, groups, data); - - n = group_common_shared_memory_tile(kernel, local->array, - n, groups, data); - - set_array_groups(local, n, groups); - - if (n >= 0) - return 0; - - for (i = 0; i < local->array->n_ref; ++i) - gpu_array_ref_group_free(groups[i]); - return -1; -} - -/* For each array in the input program that can be mapped to private memory, - * check if there are any order dependences active inside the current kernel, - * within the same iteration of the host schedule, i.e., the prefix - * schedule at "node". - * If so, mark the array as force_private so that its reference groups will be - * mapped to a registers. - * - * Note that the arrays that cannot be mapped to private memory have - * had their order dependences added to prog->array_order and - * subsequently to the coincidence constraints. 
- */ -static void check_can_be_private_live_ranges(struct ppcg_kernel *kernel, - __isl_keep isl_schedule_node *node) -{ - int i; - isl_union_set *domain; - isl_multi_union_pw_aff *prefix; - isl_union_pw_multi_aff *contraction; - - if (!kernel->options->live_range_reordering) - return; - - kernel->any_force_private = 0; - - prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); - contraction = isl_union_pw_multi_aff_copy(kernel->contraction); - prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix, - contraction); - domain = isl_union_set_copy(kernel->expanded_domain); - domain = isl_union_set_universe(domain); - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *local = &kernel->array[i]; - isl_union_map *order; - - local->force_private = 0; - if (!gpu_array_can_be_private(local->array)) - continue; - order = isl_union_map_copy(local->array->dep_order); - order = isl_union_map_intersect_domain(order, - isl_union_set_copy(domain)); - order = isl_union_map_intersect_range(order, - isl_union_set_copy(domain)); - order = isl_union_map_eq_at_multi_union_pw_aff(order, - isl_multi_union_pw_aff_copy(prefix)); - if (!isl_union_map_is_empty(order)) { - local->force_private = 1; - kernel->any_force_private = 1; - } - isl_union_map_free(order); - } - - isl_multi_union_pw_aff_free(prefix); - isl_union_set_free(domain); -} - -/* Expand the domain of the schedule "s" by plugging in - * the contraction "contraction" and return the result. - */ -static __isl_give isl_union_map *expand(__isl_take isl_union_map *s, - __isl_keep isl_union_pw_multi_aff *contraction) -{ - contraction = isl_union_pw_multi_aff_copy(contraction); - s = isl_union_map_preimage_domain_union_pw_multi_aff(s, contraction); - return s; -} - -/* Create a set of dimension data->thread_depth + data->n_thread - * that equates the residue of the final data->n_thread dimensions - * modulo the kernel->block_dim sizes to the thread identifiers. 
- * Store the computed set in data->privatization. - * - * The construction starts with the space of kernel->thread_filter, - * which is known to reference all thread identifiers. - */ -static void compute_privatization(struct gpu_group_data *data, - struct ppcg_kernel *kernel) -{ - int i; - isl_ctx *ctx; - isl_space *space; - isl_local_space *ls; - isl_set *set; - - ctx = isl_union_map_get_ctx(data->shared_sched); - space = isl_union_set_get_space(kernel->thread_filter); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, - data->thread_depth + data->n_thread); - set = isl_set_universe(space); - space = isl_set_get_space(set); - ls = isl_local_space_from_space(space); - - for (i = 0; i < data->n_thread; ++i) { - isl_aff *aff, *aff2; - isl_constraint *c; - isl_val *v; - isl_id *id; - int pos; - - aff = isl_aff_var_on_domain(isl_local_space_copy(ls), - isl_dim_set, data->thread_depth + i); - v = isl_val_int_from_si(ctx, kernel->block_dim[i]); - aff = isl_aff_mod_val(aff, v); - id = isl_id_list_get_id(kernel->thread_ids, i); - pos = isl_set_find_dim_by_id(set, isl_dim_param, id); - isl_id_free(id); - aff2 = isl_aff_var_on_domain(isl_local_space_copy(ls), - isl_dim_param, pos); - aff = isl_aff_sub(aff, aff2); - c = isl_equality_from_aff(aff); - set = isl_set_add_constraint(set, c); - } - - isl_local_space_free(ls); - data->privatization = set; -} - -/* Return the prefix schedule at "node" as a relation - * between domain elements and schedule dimensions after detecting - * equalities in this relation. - */ -static __isl_give isl_union_map *prefix_with_equalities( - __isl_keep isl_schedule_node *node) -{ - isl_union_map *schedule; - - schedule = isl_schedule_node_get_prefix_schedule_relation(node); - schedule = isl_union_map_detect_equalities(schedule); - - return schedule; -} - -/* Group references of all arrays in "kernel". - * "node" points to the kernel mark. - * The mapping to shared memory in computed at the "shared" mark. 
- * - * We first extract all required schedule information into - * a gpu_group_data structure and then consider each array - * in turn. - */ -int gpu_group_references(struct ppcg_kernel *kernel, - __isl_keep isl_schedule_node *node) -{ - int i; - int r = 0; - isl_union_pw_multi_aff *contraction; - struct gpu_group_data data; - - check_can_be_private_live_ranges(kernel, node); - - data.scop = kernel->prog->scop; - - data.kernel_depth = isl_schedule_node_get_schedule_depth(node); - data.host_sched = isl_schedule_node_get_prefix_schedule_relation(node); - - node = isl_schedule_node_copy(node); - node = gpu_tree_move_down_to_shared(node, kernel->core); - data.shared_depth = isl_schedule_node_get_schedule_depth(node); - data.shared_sched = prefix_with_equalities(node); - - node = gpu_tree_move_down_to_thread(node, kernel->core); - node = isl_schedule_node_child(node, 0); - data.thread_depth = isl_schedule_node_get_schedule_depth(node); - data.n_thread = isl_schedule_node_band_n_member(node); - if (data.thread_depth == data.shared_depth) - data.copy_sched = isl_union_map_copy(data.shared_sched); - else - data.copy_sched = prefix_with_equalities(node); - data.thread_sched = isl_union_map_copy(data.copy_sched); - data.thread_sched = isl_union_map_flat_range_product(data.thread_sched, - isl_schedule_node_band_get_partial_schedule_union_map(node)); - data.thread_sched = isl_union_map_detect_equalities(data.thread_sched); - - contraction = isl_union_pw_multi_aff_copy(kernel->contraction); - data.host_sched = expand(data.host_sched, contraction); - data.shared_sched = expand(data.shared_sched, contraction); - if (data.thread_depth == data.shared_depth) { - isl_union_map_free(data.copy_sched); - data.copy_sched = isl_union_map_copy(data.shared_sched); - } else { - data.copy_sched = expand(data.copy_sched, contraction); - } - data.thread_sched = expand(data.thread_sched, contraction); - isl_union_pw_multi_aff_free(contraction); - - node = isl_schedule_node_child(node, 0); - 
data.full_sched = isl_union_map_copy(data.thread_sched); - data.full_sched = isl_union_map_flat_range_product(data.full_sched, - isl_schedule_node_get_subtree_schedule_union_map(node)); - isl_schedule_node_free(node); - - compute_privatization(&data, kernel); - - for (i = 0; i < kernel->n_array; ++i) { - r = group_array_references(kernel, &kernel->array[i], &data); - if (r < 0) - break; - } - - isl_union_map_free(data.host_sched); - isl_union_map_free(data.shared_sched); - isl_union_map_free(data.copy_sched); - isl_union_map_free(data.thread_sched); - isl_union_map_free(data.full_sched); - isl_set_free(data.privatization); - - return r; -} - -/* Given a description of an array tile "tile" and the "space" - * - * { D -> A } - * - * where D represents the first tile->depth schedule dimensions - * and A represents the array, construct an isl_multi_aff - * - * { [D[i] -> A[a]] -> A'[a'] } - * - * with A' a scaled down copy of A according to the shifts and strides - * in "tile". In particular, - * - * a' = (a + shift(i))/stride - * - * "insert_array" represents - * - * { [D -> A] -> D } - * - * and is used to insert A into the domain of functions that only - * reference D. 
- */ -static __isl_give isl_multi_aff *strided_tile( - struct gpu_array_tile *tile, __isl_keep isl_space *space, - __isl_keep isl_multi_aff *insert_array) -{ - int i; - isl_ctx *ctx; - isl_multi_aff *shift; - isl_multi_val *stride; - isl_space *space2; - isl_local_space *ls; - isl_multi_aff *tiling; - - ctx = isl_space_get_ctx(space); - space2 = isl_space_domain(isl_space_copy(space)); - ls = isl_local_space_from_space(space2); - space2 = isl_space_range(isl_space_copy(space)); - stride = isl_multi_val_zero(space2); - shift = isl_multi_aff_zero(isl_space_copy(space)); - - for (i = 0; i < tile->n; ++i) { - struct gpu_array_bound *bound = &tile->bound[i]; - isl_val *stride_i; - isl_aff *shift_i; - - if (tile->bound[i].shift) { - stride_i = isl_val_copy(bound->stride); - shift_i = isl_aff_copy(bound->shift); - } else { - stride_i = isl_val_one(ctx); - shift_i = isl_aff_zero_on_domain( - isl_local_space_copy(ls)); - } - - stride = isl_multi_val_set_val(stride, i, stride_i); - shift = isl_multi_aff_set_aff(shift, i, shift_i); - } - isl_local_space_free(ls); - - shift = isl_multi_aff_pullback_multi_aff(shift, - isl_multi_aff_copy(insert_array)); - - tiling = isl_multi_aff_range_map(isl_space_copy(space)); - tiling = isl_multi_aff_add(tiling, shift); - tiling = isl_multi_aff_scale_down_multi_val(tiling, stride); - - return tiling; -} - -/* Compute a tiling for the array reference group "group". - * - * The tiling is of the form - * - * { [D[i] -> A[a]] -> T[t] } - * - * where D represents the first tile->depth schedule dimensions, - * A represents the global array and T represents the shared or - * private memory tile. The name of T is the name of the local - * array. 
- * - * If there is any stride in the accesses, then the mapping is - * - * t = (a + shift(i))/stride - lb(i) - * - * otherwise, it is simply - * - * t = a - lb(i) - */ -void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group) -{ - int i; - struct gpu_array_tile *tile; - isl_space *space; - isl_multi_aff *tiling, *lb, *insert_array; - isl_printer *p; - char *local_name; - - tile = gpu_array_ref_group_tile(group); - if (!tile) - return; - - space = isl_map_get_space(group->access); - space = isl_space_from_range(isl_space_range(space)); - space = isl_space_add_dims(space, isl_dim_in, tile->depth); - insert_array = isl_multi_aff_domain_map(isl_space_copy(space)); - - for (i = 0; i < tile->n; ++i) - if (tile->bound[i].shift) - break; - - if (i < tile->n) - tiling = strided_tile(tile, space, insert_array); - else - tiling = isl_multi_aff_range_map(isl_space_copy(space)); - - lb = isl_multi_aff_zero(space); - for (i = 0; i < tile->n; ++i) { - isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb); - lb = isl_multi_aff_set_aff(lb, i, lb_i); - } - lb = isl_multi_aff_pullback_multi_aff(lb, insert_array); - - tiling = isl_multi_aff_sub(tiling, lb); - - p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling)); - p = gpu_array_ref_group_print_name(group, p); - local_name = isl_printer_get_str(p); - isl_printer_free(p); - tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name); - free(local_name); - - tile->tiling = tiling; -} diff --git a/polly/lib/External/ppcg/gpu_hybrid.h b/polly/lib/External/ppcg/gpu_hybrid.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_hybrid.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef GPU_HYBRID_H -#define GPU_HYBRID_H - -#include - -#include "gpu.h" -#include "hybrid.h" - -__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen, - __isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds, - int *tile_sizes); - -#endif diff --git a/polly/lib/External/ppcg/gpu_hybrid.c 
b/polly/lib/External/ppcg/gpu_hybrid.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_hybrid.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright 2013 Ecole Normale Superieure - * Copyright 2015 Sven Verdoolaege - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include - -#include -#include -#include -#include - -#include "hybrid.h" -#include "gpu_hybrid.h" -#include "gpu_tree.h" -#include "schedule.h" -#include "util.h" - -/* Have all domain elements been filtered out before reaching - * the "node" position in the schedule tree? - */ -static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node) -{ - isl_union_set *domain; - isl_bool empty; - - domain = isl_schedule_node_get_domain(node); - empty = isl_union_set_is_empty(domain); - isl_union_set_free(domain); - - return empty; -} - -/* Given a pointer to a phase in the result of hybrid tiling, - * map the phase to the device, provided the phase is non-empty. - * Empty phases can occur if the input schedule domain can be - * covered by a small number of hexagons that all belong to the same phase. - * - * The input has the following form: - * - * M - CT - P - C - ... - * - * with M the phase marker, CT the space tiling, P the original - * parent band and C the original child band. - * The (outer dimensions of the) C band need to be mapped to threads. - * The (outer dimension of the) CT band needs to be mapped to blocks. - * The mapping to shared memory needs to be computed between the CT and - * the P band. - * - * The C band is first shifted to start at zero. - * Then the appropriate markers are introduced and a kernel is - * created for the tree rooted at CT. - * If the "unroll_gpu_tile" option is set, then the AST generator - * is instructed to unroll the P and C bands. 
- */ -static __isl_give isl_schedule_node *update_phase( - __isl_take isl_schedule_node *node, void *user) -{ - struct gpu_gen *gen = user; - int depth0, depth; - isl_ctx *ctx; - isl_id *id; - isl_bool empty_domain; - ppcg_ht_phase *phase; - - empty_domain = has_empty_domain(node); - if (empty_domain < 0) - return isl_schedule_node_free(node); - if (empty_domain) - return node; - - if (!node) - return NULL; - ctx = isl_schedule_node_get_ctx(node); - - phase = ppcg_ht_phase_extract_from_mark(node); - - depth0 = isl_schedule_node_get_tree_depth(node); - - node = isl_schedule_node_child(node, 0); - - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_child(node, 0); - node = ppcg_ht_phase_shift_space_point(phase, node); - if (gen->options->unroll_gpu_tile) - node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); - id = isl_id_alloc(ctx, "thread", NULL); - node = isl_schedule_node_insert_mark(node, id); - node = isl_schedule_node_parent(node); - if (gen->options->unroll_gpu_tile) - node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); - id = isl_id_alloc(ctx, "shared", NULL); - node = isl_schedule_node_insert_mark(node, id); - node = isl_schedule_node_parent(node); - - node = gpu_create_kernel(gen, node, 0, NULL); - - depth = isl_schedule_node_get_tree_depth(node); - node = isl_schedule_node_ancestor(node, depth - depth0); - - return node; -} - -/* Apply hybrid tiling on "node" and its parent based on the (valid) - * bounds on the relative dependence distances "bounds" and - * the tile sizes in "tile_sizes". - * The number of elements in "tile_sizes" is at least as large - * as the sum of the dimensions of the parent and the child node. - * - * Convert the tile_sizes to an isl_multi_val in the right space, - * insert the hybrid tiling and then create a kernel inside each phase. - * Finally, remove the phase marks. 
- */ -__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen, - __isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds, - int *tile_sizes) -{ - isl_multi_val *mv; - isl_space *space, *space2; - - if (!node || !bounds) - goto error; - - space2 = isl_schedule_node_band_get_space(node); - node = isl_schedule_node_parent(node); - space = isl_schedule_node_band_get_space(node); - space = isl_space_product(space, space2); - mv = ppcg_multi_val_from_int_list(space, tile_sizes); - - node = ppcg_ht_bounds_insert_tiling(bounds, mv, node, gen->options); - - node = hybrid_tile_foreach_phase(node, &update_phase, gen); - - node = hybrid_tile_drop_phase_marks(node); - - return node; -error: - isl_schedule_node_free(node); - ppcg_ht_bounds_free(bounds); - return NULL; -} diff --git a/polly/lib/External/ppcg/gpu_print.h b/polly/lib/External/ppcg/gpu_print.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_print.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef GPU_PRINT_H -#define GPU_PRINT_H - -#include "gpu.h" - -__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p, - struct gpu_prog *prog); - -__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p, - struct gpu_types *types, struct gpu_prog *prog); - -__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node); - -__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn, - struct gpu_array_info *array); -__isl_give isl_printer *gpu_array_info_print_declaration_argument( - __isl_take isl_printer *p, struct gpu_array_info *array, - const char *memory_space); -__isl_give isl_printer *gpu_array_info_print_call_argument( - __isl_take isl_printer *p, struct gpu_array_info *array); - -__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt); -__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt); - -#endif 
diff --git a/polly/lib/External/ppcg/gpu_print.c b/polly/lib/External/ppcg/gpu_print.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_print.c +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright 2012 Ecole Normale Superieure - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France - */ - -#include - -#include - -#include "gpu_print.h" -#include "print.h" -#include "schedule.h" - -/* Print declarations to "p" for arrays that are local to "prog" - * but that are used on the host and therefore require a declaration. - */ -__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - int i; - - if (!prog) - return isl_printer_free(p); - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_ast_expr *size; - - if (!array->declare_local) - continue; - size = array->declared_size; - p = ppcg_print_declaration_with_size(p, array->type, size); - } - - return p; -} - -/* Print an expression for the size of "array" in bytes. - */ -__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn, - struct gpu_array_info *array) -{ - int i; - - for (i = 0; i < array->n_index; ++i) { - isl_ast_expr *bound; - - prn = isl_printer_print_str(prn, "("); - bound = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i); - prn = isl_printer_print_ast_expr(prn, bound); - isl_ast_expr_free(bound); - prn = isl_printer_print_str(prn, ") * "); - } - prn = isl_printer_print_str(prn, "sizeof("); - prn = isl_printer_print_str(prn, array->type); - prn = isl_printer_print_str(prn, ")"); - - return prn; -} - -/* Print the declaration of a non-linearized array argument. 
- */ -static __isl_give isl_printer *print_non_linearized_declaration_argument( - __isl_take isl_printer *p, struct gpu_array_info *array) -{ - p = isl_printer_print_str(p, array->type); - p = isl_printer_print_str(p, " "); - - p = isl_printer_print_ast_expr(p, array->bound_expr); - - return p; -} - -/* Print the declaration of an array argument. - * "memory_space" allows to specify a memory space prefix. - */ -__isl_give isl_printer *gpu_array_info_print_declaration_argument( - __isl_take isl_printer *p, struct gpu_array_info *array, - const char *memory_space) -{ - if (gpu_array_is_read_only_scalar(array)) { - p = isl_printer_print_str(p, array->type); - p = isl_printer_print_str(p, " "); - p = isl_printer_print_str(p, array->name); - return p; - } - - if (memory_space) { - p = isl_printer_print_str(p, memory_space); - p = isl_printer_print_str(p, " "); - } - - if (array->n_index != 0 && !array->linearize) - return print_non_linearized_declaration_argument(p, array); - - p = isl_printer_print_str(p, array->type); - p = isl_printer_print_str(p, " "); - p = isl_printer_print_str(p, "*"); - p = isl_printer_print_str(p, array->name); - - return p; -} - -/* Print the call of an array argument. - */ -__isl_give isl_printer *gpu_array_info_print_call_argument( - __isl_take isl_printer *p, struct gpu_array_info *array) -{ - if (gpu_array_is_read_only_scalar(array)) - return isl_printer_print_str(p, array->name); - - p = isl_printer_print_str(p, "dev_"); - p = isl_printer_print_str(p, array->name); - - return p; -} - -/* Print an access to the element in the private/shared memory copy - * described by "stmt". The index of the copy is recorded in - * stmt->local_index as an access to the array. - */ -static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt) -{ - return isl_printer_print_ast_expr(p, stmt->u.c.local_index); -} - -/* Print an access to the element in the global memory copy - * described by "stmt". 
The index of the copy is recorded in - * stmt->index as an access to the array. - */ -static __isl_give isl_printer *stmt_print_global_index( - __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt) -{ - struct gpu_array_info *array = stmt->u.c.array; - isl_ast_expr *index; - - if (gpu_array_is_scalar(array)) { - if (!gpu_array_is_read_only_scalar(array)) - p = isl_printer_print_str(p, "*"); - p = isl_printer_print_str(p, array->name); - return p; - } - - index = isl_ast_expr_copy(stmt->u.c.index); - - p = isl_printer_print_ast_expr(p, index); - isl_ast_expr_free(index); - - return p; -} - -/* Print a copy statement. - * - * A read copy statement is printed as - * - * local = global; - * - * while a write copy statement is printed as - * - * global = local; - */ -__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt) -{ - p = isl_printer_start_line(p); - if (stmt->u.c.read) { - p = stmt_print_local_index(p, stmt); - p = isl_printer_print_str(p, " = "); - p = stmt_print_global_index(p, stmt); - } else { - p = stmt_print_global_index(p, stmt); - p = isl_printer_print_str(p, " = "); - p = stmt_print_local_index(p, stmt); - } - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt) -{ - return pet_stmt_print_body(stmt->u.d.stmt->stmt, p, stmt->u.d.ref2expr); -} - -/* This function is called for each node in a GPU AST. - * In case of a user node, print the macro definitions required - * for printing the AST expressions in the annotation, if any. - * For other nodes, return true such that descendants are also - * visited. - * - * In particular, for a kernel launch, print the macro definitions - * needed for the grid size. - * For a copy statement, print the macro definitions needed - * for the two index expressions. 
- * For an original user statement, print the macro definitions - * needed for the substitutions. - */ -static isl_bool at_node(__isl_keep isl_ast_node *node, void *user) -{ - const char *name; - isl_id *id; - int is_kernel; - struct ppcg_kernel *kernel; - struct ppcg_kernel_stmt *stmt; - isl_printer **p = user; - - if (isl_ast_node_get_type(node) != isl_ast_node_user) - return isl_bool_true; - - id = isl_ast_node_get_annotation(node); - if (!id) - return isl_bool_false; - - name = isl_id_get_name(id); - if (!name) - return isl_bool_error; - is_kernel = !strcmp(name, "kernel"); - kernel = is_kernel ? isl_id_get_user(id) : NULL; - stmt = is_kernel ? NULL : isl_id_get_user(id); - isl_id_free(id); - - if ((is_kernel && !kernel) || (!is_kernel && !stmt)) - return isl_bool_error; - - if (is_kernel) { - *p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p); - } else if (stmt->type == ppcg_kernel_copy) { - *p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p); - *p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p); - } else if (stmt->type == ppcg_kernel_domain) { - *p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr); - } - if (!*p) - return isl_bool_error; - - return isl_bool_false; -} - -/* Print the required macros for the GPU AST "node" to "p", - * including those needed for the user statements inside the AST. - */ -__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node) -{ - if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0) - return isl_printer_free(p); - p = ppcg_print_macros(p, node); - return p; -} - -/* Was the definition of "type" printed before? - * That is, does its name appear in the list of printed types "types"? 
- */ -static int already_printed(struct gpu_types *types, - struct pet_type *type) -{ - int i; - - for (i = 0; i < types->n; ++i) - if (!strcmp(types->name[i], type->name)) - return 1; - - return 0; -} - -/* Print the definitions of all types prog->scop that have not been - * printed before (according to "types") on "p". - * Extend the list of printed types "types" with the newly printed types. - */ -__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p, - struct gpu_types *types, struct gpu_prog *prog) -{ - int i, n; - isl_ctx *ctx; - char **name; - - n = prog->scop->pet->n_type; - - if (n == 0) - return p; - - ctx = isl_printer_get_ctx(p); - name = isl_realloc_array(ctx, types->name, char *, types->n + n); - if (!name) - return isl_printer_free(p); - types->name = name; - - for (i = 0; i < n; ++i) { - struct pet_type *type = prog->scop->pet->types[i]; - - if (already_printed(types, type)) - continue; - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, type->definition); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - types->name[types->n++] = strdup(type->name); - } - - return p; -} diff --git a/polly/lib/External/ppcg/gpu_tree.h b/polly/lib/External/ppcg/gpu_tree.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_tree.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef GPU_TREE_H -#define GPU_TREE_H - -#include - -#include "gpu.h" - -__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread( - __isl_take isl_schedule_node *node); -int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node); -__isl_give isl_schedule_node *gpu_tree_move_down_to_shared( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core); -__isl_give isl_schedule_node *gpu_tree_move_up_to_thread( - __isl_take isl_schedule_node *node); -__isl_give isl_schedule_node *gpu_tree_move_down_to_thread( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core); -__isl_give isl_schedule_node 
*gpu_tree_move_up_to_kernel( - __isl_take isl_schedule_node *node); -__isl_give isl_schedule_node *gpu_tree_move_down_to_depth( - __isl_take isl_schedule_node *node, int depth, - __isl_keep isl_union_set *core); - -int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel); -__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); -__isl_give isl_schedule_node *gpu_tree_ensure_following_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); -__isl_give isl_schedule_node *gpu_tree_move_left_to_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); -__isl_give isl_schedule_node *gpu_tree_move_right_to_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); - -#endif diff --git a/polly/lib/External/ppcg/gpu_tree.c b/polly/lib/External/ppcg/gpu_tree.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_tree.c +++ /dev/null @@ -1,640 +0,0 @@ -/* - * Copyright 2013 Ecole Normale Superieure - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include - -#include -#include -#include - -#include "gpu_tree.h" - -/* The functions in this file are used to navigate part of a schedule tree - * that is mapped to blocks. Initially, this part consists of a linear - * branch segment with a mark node with name "kernel" on the outer end - * and a mark node with name "thread" on the inner end. - * During the mapping to blocks, branching may be introduced, but only - * one of the elements in each sequence contains the "thread" mark. - * The filter of this element (and only this filter) contains - * domain elements identified by the "core" argument of the functions - * that move down this tree. 
- * - * Synchronization statements have a name that starts with "sync" and - * a user pointer pointing to the kernel that contains the synchronization. - * The functions inserting or detecting synchronizations take a ppcg_kernel - * argument to be able to create or identify such statements. - * They may also use two fields in this structure, the "core" field - * to move around in the tree and the "n_sync" field to make sure that - * each synchronization has a different name (within the kernel). - */ - -/* Is "node" a mark node with an identifier called "name"? - */ -static int is_marked(__isl_keep isl_schedule_node *node, const char *name) -{ - isl_id *mark; - int has_name; - - if (!node) - return -1; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_mark) - return 0; - - mark = isl_schedule_node_mark_get_id(node); - if (!mark) - return -1; - - has_name = !strcmp(isl_id_get_name(mark), name); - isl_id_free(mark); - - return has_name; -} - -/* Is "node" a mark node with an identifier called "kernel"? - */ -int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node) -{ - return is_marked(node, "kernel"); -} - -/* Is "node" a mark node with an identifier called "shared"? - */ -static int node_is_shared(__isl_keep isl_schedule_node *node) -{ - return is_marked(node, "shared"); -} - -/* Is "node" a mark node with an identifier called "thread"? - */ -static int node_is_thread(__isl_keep isl_schedule_node *node) -{ - return is_marked(node, "thread"); -} - -/* Insert a mark node with identifier "shared" in front of "node". 
- */ -static __isl_give isl_schedule_node *insert_shared( - __isl_take isl_schedule_node *node) -{ - isl_ctx *ctx; - isl_id *id; - - ctx = isl_schedule_node_get_ctx(node); - id = isl_id_alloc(ctx, "shared", NULL); - node = isl_schedule_node_insert_mark(node, id); - - return node; -} - -/* Insert a "shared" mark in front of the "thread" mark - * provided the linear branch between "node" and the "thread" mark - * does not contain such a "shared" mark already. - * - * As a side effect, this function checks that the subtree at "node" - * actually contains a "thread" mark and that there is no branching - * in between "node" and this "thread" mark. - */ -__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread( - __isl_take isl_schedule_node *node) -{ - int depth0, depth; - int any_shared = 0; - - if (!node) - return NULL; - - depth0 = isl_schedule_node_get_tree_depth(node); - - for (;;) { - int is_thread; - int n; - - if (!any_shared) { - any_shared = node_is_shared(node); - if (any_shared < 0) - return isl_schedule_node_free(node); - } - is_thread = node_is_thread(node); - if (is_thread < 0) - return isl_schedule_node_free(node); - if (is_thread) - break; - n = isl_schedule_node_n_children(node); - if (n == 0) - isl_die(isl_schedule_node_get_ctx(node), - isl_error_invalid, - "no thread marker found", - return isl_schedule_node_free(node)); - if (n > 1) - isl_die(isl_schedule_node_get_ctx(node), - isl_error_invalid, - "expecting single thread marker", - return isl_schedule_node_free(node)); - - node = isl_schedule_node_child(node, 0); - } - - if (!any_shared) - node = insert_shared(node); - depth = isl_schedule_node_get_tree_depth(node); - node = isl_schedule_node_ancestor(node, depth - depth0); - - return node; -} - -/* Assuming "node" is a filter node, does it correspond to the branch - * that contains the "thread" mark, i.e., does it contain any elements - * in "core"? 
- */ -static int node_is_core(__isl_keep isl_schedule_node *node, - __isl_keep isl_union_set *core) -{ - int disjoint; - isl_union_set *filter; - - filter = isl_schedule_node_filter_get_filter(node); - disjoint = isl_union_set_is_disjoint(filter, core); - isl_union_set_free(filter); - if (disjoint < 0) - return -1; - - return !disjoint; -} - -/* Move to the only child of "node" that has the "thread" mark as descendant, - * where the branch containing this mark is identified by the domain elements - * in "core". - * - * If "node" is not a sequence, then it only has one child and we move - * to that single child. - * Otherwise, we check each of the filters in the children, pick - * the one that corresponds to "core" and return a pointer to the child - * of the filter node. - */ -static __isl_give isl_schedule_node *core_child( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) -{ - int i, n; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence) - return isl_schedule_node_child(node, 0); - - n = isl_schedule_node_n_children(node); - for (i = 0; i < n; ++i) { - int is_core; - - node = isl_schedule_node_child(node, i); - is_core = node_is_core(node, core); - - if (is_core < 0) - return isl_schedule_node_free(node); - if (is_core) - return isl_schedule_node_child(node, 0); - - node = isl_schedule_node_parent(node); - } - - isl_die(isl_schedule_node_get_ctx(node), isl_error_internal, - "core child not found", return isl_schedule_node_free(node)); -} - -/* Move down the branch between "kernel" and "thread" until - * the "shared" mark is reached, where the branch containing the "shared" - * mark is identified by the domain elements in "core". 
- */ -__isl_give isl_schedule_node *gpu_tree_move_down_to_shared( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) -{ - int is_shared; - - while ((is_shared = node_is_shared(node)) == 0) - node = core_child(node, core); - if (is_shared < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Move down the branch between "kernel" and "thread" until - * the "thread" mark is reached, where the branch containing the "thread" - * mark is identified by the domain elements in "core". - */ -__isl_give isl_schedule_node *gpu_tree_move_down_to_thread( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) -{ - int is_thread; - - while ((is_thread = node_is_thread(node)) == 0) - node = core_child(node, core); - if (is_thread < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Move up the tree underneath the "thread" mark until - * the "thread" mark is reached. - */ -__isl_give isl_schedule_node *gpu_tree_move_up_to_thread( - __isl_take isl_schedule_node *node) -{ - int is_thread; - - while ((is_thread = node_is_thread(node)) == 0) - node = isl_schedule_node_parent(node); - if (is_thread < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Move up the tree underneath the "kernel" mark until - * the "kernel" mark is reached. - */ -__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel( - __isl_take isl_schedule_node *node) -{ - int is_kernel; - - while ((is_kernel = gpu_tree_node_is_kernel(node)) == 0) - node = isl_schedule_node_parent(node); - if (is_kernel < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Move down from the "kernel" mark (or at least a node with schedule - * depth smaller than or equal to "depth") to a band node at schedule - * depth "depth". The "thread" mark is assumed to have a schedule - * depth greater than or equal to "depth". The branch containing the - * "thread" mark is identified by the domain elements in "core". 
- * - * If the desired schedule depth is in the middle of band node, - * then the band node is split into two pieces, the second piece - * at the desired schedule depth. - */ -__isl_give isl_schedule_node *gpu_tree_move_down_to_depth( - __isl_take isl_schedule_node *node, int depth, - __isl_keep isl_union_set *core) -{ - int is_shared; - int is_thread = 0; - - while (node && isl_schedule_node_get_schedule_depth(node) < depth) { - if (isl_schedule_node_get_type(node) == - isl_schedule_node_band) { - int node_depth, node_dim; - node_depth = isl_schedule_node_get_schedule_depth(node); - node_dim = isl_schedule_node_band_n_member(node); - if (node_depth + node_dim > depth) - node = isl_schedule_node_band_split(node, - depth - node_depth); - } - node = core_child(node, core); - } - while ((is_shared = node_is_shared(node)) == 0 && - (is_thread = node_is_thread(node)) == 0 && - isl_schedule_node_get_type(node) != isl_schedule_node_band) - node = core_child(node, core); - if (is_shared < 0 || is_thread < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Create a union set containing a single set with a tuple identifier - * called "syncX" and user pointer equal to "kernel". - */ -static __isl_give isl_union_set *create_sync_domain(struct ppcg_kernel *kernel) -{ - isl_space *space; - isl_id *id; - char name[40]; - - space = isl_space_set_alloc(kernel->ctx, 0, 0); - snprintf(name, sizeof(name), "sync%d", kernel->n_sync++); - id = isl_id_alloc(kernel->ctx, name, kernel); - space = isl_space_set_tuple_id(space, isl_dim_set, id); - return isl_union_set_from_set(isl_set_universe(space)); -} - -/* Is "id" the identifier of a synchronization statement inside "kernel"? - * That is, does its name start with "sync" and does it point to "kernel"? 
- */ -int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel) -{ - const char *name; - - name = isl_id_get_name(id); - if (!name) - return 0; - else if (strncmp(name, "sync", 4)) - return 0; - return isl_id_get_user(id) == kernel; -} - -/* Does "domain" consist of a single set with a tuple identifier - * corresponding to a synchronization for "kernel"? - */ -static int domain_is_sync(__isl_keep isl_union_set *domain, - struct ppcg_kernel *kernel) -{ - int is_sync; - isl_id *id; - isl_set *set; - - if (isl_union_set_n_set(domain) != 1) - return 0; - set = isl_set_from_union_set(isl_union_set_copy(domain)); - id = isl_set_get_tuple_id(set); - is_sync = gpu_tree_id_is_sync(id, kernel); - isl_id_free(id); - isl_set_free(set); - - return is_sync; -} - -/* Does "node" point to a filter selecting a synchronization statement - * for "kernel"? - */ -static int node_is_sync_filter(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int is_sync; - enum isl_schedule_node_type type; - isl_union_set *domain; - - if (!node) - return -1; - type = isl_schedule_node_get_type(node); - if (type != isl_schedule_node_filter) - return 0; - domain = isl_schedule_node_filter_get_filter(node); - is_sync = domain_is_sync(domain, kernel); - isl_union_set_free(domain); - - return is_sync; -} - -/* Is "node" part of a sequence with a previous synchronization statement - * for "kernel"? - * That is, is the parent of "node" a filter such that there is - * a previous filter that picks out exactly such a synchronization statement? 
- */ -static int has_preceding_sync(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int found = 0; - - node = isl_schedule_node_copy(node); - node = isl_schedule_node_parent(node); - while (!found && isl_schedule_node_has_previous_sibling(node)) { - node = isl_schedule_node_previous_sibling(node); - if (!node) - break; - found = node_is_sync_filter(node, kernel); - } - if (!node) - found = -1; - isl_schedule_node_free(node); - - return found; -} - -/* Is "node" part of a sequence with a subsequent synchronization statement - * for "kernel"? - * That is, is the parent of "node" a filter such that there is - * a subsequent filter that picks out exactly such a synchronization statement? - */ -static int has_following_sync(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int found = 0; - - node = isl_schedule_node_copy(node); - node = isl_schedule_node_parent(node); - while (!found && isl_schedule_node_has_next_sibling(node)) { - node = isl_schedule_node_next_sibling(node); - if (!node) - break; - found = node_is_sync_filter(node, kernel); - } - if (!node) - found = -1; - isl_schedule_node_free(node); - - return found; -} - -/* Does the subtree rooted at "node" (which is a band node) contain - * any synchronization statement for "kernel" that precedes - * the core computation of "kernel" (identified by the elements - * in kernel->core)? 
- */ -static int has_sync_before_core(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int has_sync = 0; - int is_thread; - - node = isl_schedule_node_copy(node); - while ((is_thread = node_is_thread(node)) == 0) { - node = core_child(node, kernel->core); - has_sync = has_preceding_sync(node, kernel); - if (has_sync < 0 || has_sync) - break; - } - if (is_thread < 0 || !node) - has_sync = -1; - isl_schedule_node_free(node); - - return has_sync; -} - -/* Does the subtree rooted at "node" (which is a band node) contain - * any synchronization statement for "kernel" that follows - * the core computation of "kernel" (identified by the elements - * in kernel->core)? - */ -static int has_sync_after_core(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int has_sync = 0; - int is_thread; - - node = isl_schedule_node_copy(node); - while ((is_thread = node_is_thread(node)) == 0) { - node = core_child(node, kernel->core); - has_sync = has_following_sync(node, kernel); - if (has_sync < 0 || has_sync) - break; - } - if (is_thread < 0 || !node) - has_sync = -1; - isl_schedule_node_free(node); - - return has_sync; -} - -/* Insert (or extend) an extension on top of "node" that puts - * a synchronization node for "kernel" before "node". - * Return a pointer to the original node in the updated schedule tree. - */ -static __isl_give isl_schedule_node *insert_sync_before( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - isl_union_set *domain; - isl_schedule_node *graft; - - if (!node) - return NULL; - - domain = create_sync_domain(kernel); - graft = isl_schedule_node_from_domain(domain); - node = isl_schedule_node_graft_before(node, graft); - - return node; -} - -/* Insert (or extend) an extension on top of "node" that puts - * a synchronization node for "kernel" afater "node". - * Return a pointer to the original node in the updated schedule tree. 
- */ -static __isl_give isl_schedule_node *insert_sync_after( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - isl_union_set *domain; - isl_schedule_node *graft; - - if (!node) - return NULL; - - domain = create_sync_domain(kernel); - graft = isl_schedule_node_from_domain(domain); - node = isl_schedule_node_graft_after(node, graft); - - return node; -} - -/* Insert an extension on top of "node" that puts a synchronization node - * for "kernel" before "node" unless there already is - * such a synchronization node. - */ -__isl_give isl_schedule_node *gpu_tree_ensure_preceding_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - - has_sync = has_preceding_sync(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - return insert_sync_before(node, kernel); -} - -/* Insert an extension on top of "node" that puts a synchronization node - * for "kernel" after "node" unless there already is - * such a synchronization node. - */ -__isl_give isl_schedule_node *gpu_tree_ensure_following_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - - has_sync = has_following_sync(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - return insert_sync_after(node, kernel); -} - -/* Insert an extension on top of "node" that puts a synchronization node - * for "kernel" after "node" unless there already is such a sync node or - * "node" itself already * contains a synchronization node following - * the core computation of "kernel". 
- */ -__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - - has_sync = has_sync_after_core(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - has_sync = has_following_sync(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - return insert_sync_after(node, kernel); -} - -/* Move left in the sequence on top of "node" to a synchronization node - * for "kernel". - * If "node" itself contains a synchronization node preceding - * the core computation of "kernel", then return "node" itself. - * Otherwise, if "node" does not have a preceding synchronization node, - * then create one first. - */ -__isl_give isl_schedule_node *gpu_tree_move_left_to_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - int is_sync; - - has_sync = has_sync_before_core(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - node = gpu_tree_ensure_preceding_sync(node, kernel); - node = isl_schedule_node_parent(node); - while ((is_sync = node_is_sync_filter(node, kernel)) == 0) - node = isl_schedule_node_previous_sibling(node); - if (is_sync < 0) - node = isl_schedule_node_free(node); - node = isl_schedule_node_child(node, 0); - - return node; -} - -/* Move right in the sequence on top of "node" to a synchronization node - * for "kernel". - * If "node" itself contains a synchronization node following - * the core computation of "kernel", then return "node" itself. - * Otherwise, if "node" does not have a following synchronization node, - * then create one first. 
- */ -__isl_give isl_schedule_node *gpu_tree_move_right_to_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - int is_sync; - - has_sync = has_sync_after_core(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - node = gpu_tree_ensure_following_sync(node, kernel); - node = isl_schedule_node_parent(node); - while ((is_sync = node_is_sync_filter(node, kernel)) == 0) - node = isl_schedule_node_next_sibling(node); - if (is_sync < 0) - node = isl_schedule_node_free(node); - node = isl_schedule_node_child(node, 0); - - return node; -} diff --git a/polly/lib/External/ppcg/grouping.c b/polly/lib/External/ppcg/grouping.c deleted file mode 100644 --- a/polly/lib/External/ppcg/grouping.c +++ /dev/null @@ -1,684 +0,0 @@ -/* - * Copyright 2016 Sven Verdoolaege - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ppcg.h" - -/* Internal data structure for use during the detection of statements - * that can be grouped. - * - * "sc" contains the original schedule constraints (not a copy). - * "dep" contains the intersection of the validity and the proximity - * constraints in "sc". It may be NULL if it has not been computed yet. - * "group_id" is the identifier for the next group that is extracted. - * - * "domain" is the set of statement instances that belong to any of the groups. - * "contraction" maps the elements of "domain" to the corresponding group - * instances. - * "schedule" schedules the statements in each group relatively to each other. - * These last three fields are NULL if no groups have been found so far. 
- */ -struct ppcg_grouping { - isl_schedule_constraints *sc; - - isl_union_map *dep; - int group_id; - - isl_union_set *domain; - isl_union_pw_multi_aff *contraction; - isl_schedule *schedule; -}; - -/* Clear all memory allocated by "grouping". - */ -static void ppcg_grouping_clear(struct ppcg_grouping *grouping) -{ - isl_union_map_free(grouping->dep); - isl_union_set_free(grouping->domain); - isl_union_pw_multi_aff_free(grouping->contraction); - isl_schedule_free(grouping->schedule); -} - -/* Compute the intersection of the proximity and validity dependences - * in grouping->sc and store the result in grouping->dep, unless - * this intersection has been computed before. - */ -static isl_stat ppcg_grouping_compute_dep(struct ppcg_grouping *grouping) -{ - isl_union_map *validity, *proximity; - - if (grouping->dep) - return isl_stat_ok; - - validity = isl_schedule_constraints_get_validity(grouping->sc); - proximity = isl_schedule_constraints_get_proximity(grouping->sc); - grouping->dep = isl_union_map_intersect(validity, proximity); - - if (!grouping->dep) - return isl_stat_error; - - return isl_stat_ok; -} - -/* Information extracted from one or more consecutive leaves - * in the input schedule. - * - * "list" contains the sets of statement instances in the leaves, - * one element in the list for each original leaf. - * "domain" contains the union of the sets in "list". - * "prefix" contains the prefix schedule of these elements. - */ -struct ppcg_grouping_leaf { - isl_union_set *domain; - isl_union_set_list *list; - isl_multi_union_pw_aff *prefix; -}; - -/* Free all memory allocated for "leaves". 
- */ -static void ppcg_grouping_leaf_free(int n, struct ppcg_grouping_leaf leaves[]) -{ - int i; - - if (!leaves) - return; - - for (i = 0; i < n; ++i) { - isl_union_set_free(leaves[i].domain); - isl_union_set_list_free(leaves[i].list); - isl_multi_union_pw_aff_free(leaves[i].prefix); - } - - free(leaves); -} - -/* Short-hand for retrieving the prefix schedule at "node" - * in the form of an isl_multi_union_pw_aff. - */ -static __isl_give isl_multi_union_pw_aff *get_prefix( - __isl_keep isl_schedule_node *node) -{ - return isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); -} - -/* Return an array of "n" elements with information extracted from - * the "n" children of "node" starting at "first", all of which - * are known to be filtered leaves. - */ -struct ppcg_grouping_leaf *extract_leaves(__isl_keep isl_schedule_node *node, - int first, int n) -{ - int i; - isl_ctx *ctx; - struct ppcg_grouping_leaf *leaves; - - if (!node) - return NULL; - - ctx = isl_schedule_node_get_ctx(node); - leaves = isl_calloc_array(ctx, struct ppcg_grouping_leaf, n); - if (!leaves) - return NULL; - - for (i = 0; i < n; ++i) { - isl_schedule_node *child; - isl_union_set *domain; - - child = isl_schedule_node_get_child(node, first + i); - child = isl_schedule_node_child(child, 0); - domain = isl_schedule_node_get_domain(child); - leaves[i].domain = isl_union_set_copy(domain); - leaves[i].list = isl_union_set_list_from_union_set(domain); - leaves[i].prefix = get_prefix(child); - isl_schedule_node_free(child); - } - - return leaves; -} - -/* Internal data structure used by merge_leaves. - * - * "src" and "dst" point to the two consecutive leaves that are - * under investigation for being merged. - * "merge" is initially set to 0 and is set to 1 as soon as - * it turns out that it is useful to merge the two leaves. 
- */ -struct ppcg_merge_leaves_data { - int merge; - struct ppcg_grouping_leaf *src; - struct ppcg_grouping_leaf *dst; -}; - -/* Given a relation "map" between instances of two statements A and B, - * does it relate every instance of A (according to the domain of "src") - * to every instance of B (according to the domain of "dst")? - */ -static isl_bool covers_src_and_dst(__isl_keep isl_map *map, - struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst) -{ - isl_space *space; - isl_set *set1, *set2; - isl_bool is_subset; - - space = isl_space_domain(isl_map_get_space(map)); - set1 = isl_union_set_extract_set(src->domain, space); - set2 = isl_map_domain(isl_map_copy(map)); - is_subset = isl_set_is_subset(set1, set2); - isl_set_free(set1); - isl_set_free(set2); - if (is_subset < 0 || !is_subset) - return is_subset; - - space = isl_space_range(isl_map_get_space(map)); - set1 = isl_union_set_extract_set(dst->domain, space); - set2 = isl_map_range(isl_map_copy(map)); - is_subset = isl_set_is_subset(set1, set2); - isl_set_free(set1); - isl_set_free(set2); - - return is_subset; -} - -/* Given a relation "map" between instances of two statements A and B, - * are pairs of related instances executed together in the input schedule? - * That is, is each pair of instances assigned the same value - * by the corresponding prefix schedules? - * - * In particular, select the subset of "map" that has pairs of elements - * with the same value for the prefix schedules and then check - * if "map" is still a subset of the result. 
- */ -static isl_bool matches_prefix(__isl_keep isl_map *map, - struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst) -{ - isl_union_map *umap, *equal; - isl_multi_union_pw_aff *src_prefix, *dst_prefix, *prefix; - isl_bool is_subset; - - src_prefix = isl_multi_union_pw_aff_copy(src->prefix); - dst_prefix = isl_multi_union_pw_aff_copy(dst->prefix); - prefix = isl_multi_union_pw_aff_union_add(src_prefix, dst_prefix); - - umap = isl_union_map_from_map(isl_map_copy(map)); - equal = isl_union_map_copy(umap); - equal = isl_union_map_eq_at_multi_union_pw_aff(equal, prefix); - - is_subset = isl_union_map_is_subset(umap, equal); - - isl_union_map_free(umap); - isl_union_map_free(equal); - - return is_subset; -} - -/* Given a set of validity and proximity schedule constraints "map" - * between statements in consecutive leaves in a valid schedule, - * should the two leaves be merged into one? - * - * In particular, the two are merged if the constraints form - * a bijection between every instance of the first statement and - * every instance of the second statement. Moreover, each - * pair of such dependent instances needs to be executed consecutively - * in the input schedule. That is, they need to be assigned - * the same value by their prefix schedules. - * - * What this means is that for each instance of the first statement - * there is exactly one instance of the second statement that - * is executed immediately after the instance of the first statement and - * that, moreover, both depends on this statement instance and - * should be brought as close as possible to this statement instance. - * In other words, it is both possible to execute the two instances - * together (according to the input schedule) and desirable to do so - * (according to the validity and proximity schedule constraints). 
- */ -static isl_stat check_merge(__isl_take isl_map *map, void *user) -{ - struct ppcg_merge_leaves_data *data = user; - isl_bool ok; - - ok = covers_src_and_dst(map, data->src, data->dst); - if (ok >= 0 && ok) - ok = isl_map_is_bijective(map); - if (ok >= 0 && ok) - ok = matches_prefix(map, data->src, data->dst); - - isl_map_free(map); - - if (ok < 0) - return isl_stat_error; - if (!ok) - return isl_stat_ok; - - data->merge = 1; - return isl_stat_error; -} - -/* Merge the leaves at position "pos" and "pos + 1" in "leaves". - */ -static isl_stat merge_pair(int n, struct ppcg_grouping_leaf leaves[], int pos) -{ - int i; - - leaves[pos].domain = isl_union_set_union(leaves[pos].domain, - leaves[pos + 1].domain); - leaves[pos].list = isl_union_set_list_concat(leaves[pos].list, - leaves[pos + 1].list); - leaves[pos].prefix = isl_multi_union_pw_aff_union_add( - leaves[pos].prefix, leaves[pos + 1].prefix); - for (i = pos + 1; i + 1 < n; ++i) - leaves[i] = leaves[i + 1]; - leaves[n - 1].domain = NULL; - leaves[n - 1].list = NULL; - leaves[n - 1].prefix = NULL; - - if (!leaves[pos].domain || !leaves[pos].list || !leaves[pos].prefix) - return isl_stat_error; - - return isl_stat_ok; -} - -/* Merge pairs of consecutive leaves in "leaves" taking into account - * the intersection of validity and proximity schedule constraints "dep". - * - * If a leaf has been merged with the next leaf, then the combination - * is checked again for merging with the next leaf. - * That is, if the leaves are A, B and C, then B may not have been - * merged with C, but after merging A and B, it could still be useful - * to merge the combination AB with C. - * - * Two leaves A and B are merged if there are instances of at least - * one pair of statements, one statement in A and one B, such that - * the validity and proximity schedule constraints between them - * make them suitable for merging according to check_merge. - * - * Return the final number of leaves in the sequence, or -1 on error. 
- */ -static int merge_leaves(int n, struct ppcg_grouping_leaf leaves[], - __isl_keep isl_union_map *dep) -{ - int i; - struct ppcg_merge_leaves_data data; - - for (i = n - 1; i >= 0; --i) { - isl_union_map *dep_i; - isl_stat ok; - - if (i + 1 >= n) - continue; - - dep_i = isl_union_map_copy(dep); - dep_i = isl_union_map_intersect_domain(dep_i, - isl_union_set_copy(leaves[i].domain)); - dep_i = isl_union_map_intersect_range(dep_i, - isl_union_set_copy(leaves[i + 1].domain)); - data.merge = 0; - data.src = &leaves[i]; - data.dst = &leaves[i + 1]; - ok = isl_union_map_foreach_map(dep_i, &check_merge, &data); - isl_union_map_free(dep_i); - if (ok < 0 && !data.merge) - return -1; - if (!data.merge) - continue; - if (merge_pair(n, leaves, i) < 0) - return -1; - --n; - ++i; - } - - return n; -} - -/* Construct a schedule with "domain" as domain, that executes - * the elements of "list" in order (as a sequence). - */ -static __isl_give isl_schedule *schedule_from_domain_and_list( - __isl_keep isl_union_set *domain, __isl_keep isl_union_set_list *list) -{ - isl_schedule *schedule; - isl_schedule_node *node; - - schedule = isl_schedule_from_domain(isl_union_set_copy(domain)); - node = isl_schedule_get_root(schedule); - isl_schedule_free(schedule); - node = isl_schedule_node_child(node, 0); - list = isl_union_set_list_copy(list); - node = isl_schedule_node_insert_sequence(node, list); - schedule = isl_schedule_node_get_schedule(node); - isl_schedule_node_free(node); - - return schedule; -} - -/* Construct a unique identifier for a group in "grouping". - * - * The name is of the form G_n, with n the first value starting at - * grouping->group_id that does not result in an identifier - * that is already in use in the domain of the original schedule - * constraints. 
- */ -static isl_id *construct_group_id(struct ppcg_grouping *grouping, - __isl_take isl_space *space) -{ - isl_ctx *ctx; - isl_id *id; - isl_bool empty; - isl_union_set *domain; - - if (!space) - return NULL; - - ctx = isl_space_get_ctx(space); - domain = isl_schedule_constraints_get_domain(grouping->sc); - - do { - char buffer[20]; - isl_id *id; - isl_set *set; - - snprintf(buffer, sizeof(buffer), "G_%d", grouping->group_id); - grouping->group_id++; - id = isl_id_alloc(ctx, buffer, NULL); - space = isl_space_set_tuple_id(space, isl_dim_set, id); - set = isl_union_set_extract_set(domain, isl_space_copy(space)); - empty = isl_set_plain_is_empty(set); - isl_set_free(set); - } while (empty >= 0 && !empty); - - if (empty < 0) - space = isl_space_free(space); - - id = isl_space_get_tuple_id(space, isl_dim_set); - - isl_space_free(space); - isl_union_set_free(domain); - - return id; -} - -/* Construct a contraction from "prefix" and "domain" for a new group - * in "grouping". - * - * The values of the prefix schedule "prefix" are used as instances - * of the new group. The identifier of the group is constructed - * in such a way that it does not conflict with those of earlier - * groups nor with statements in the domain of the original - * schedule constraints. - * The isl_multi_union_pw_aff "prefix" then simply needs to be - * converted to an isl_union_pw_multi_aff. However, this is not - * possible if "prefix" is zero-dimensional, so in this case, - * a contraction is constructed from "domain" instead. 
- */ -static isl_union_pw_multi_aff *group_contraction_from_prefix_and_domain( - struct ppcg_grouping *grouping, - __isl_keep isl_multi_union_pw_aff *prefix, - __isl_keep isl_union_set *domain) -{ - isl_id *id; - isl_space *space; - int dim; - - space = isl_multi_union_pw_aff_get_space(prefix); - if (!space) - return NULL; - dim = isl_space_dim(space, isl_dim_set); - id = construct_group_id(grouping, space); - if (dim == 0) { - isl_multi_val *mv; - - space = isl_multi_union_pw_aff_get_space(prefix); - space = isl_space_set_tuple_id(space, isl_dim_set, id); - mv = isl_multi_val_zero(space); - domain = isl_union_set_copy(domain); - return isl_union_pw_multi_aff_multi_val_on_domain(domain, mv); - } - prefix = isl_multi_union_pw_aff_copy(prefix); - prefix = isl_multi_union_pw_aff_set_tuple_id(prefix, isl_dim_out, id); - return isl_union_pw_multi_aff_from_multi_union_pw_aff(prefix); -} - -/* Extend "grouping" with groups corresponding to merged - * leaves in the list of potentially merged leaves "leaves". - * - * The "list" field of each element in "leaves" contains a list - * of the instances sets of the original leaves that have been - * merged into this element. If at least two of the original leaves - * have been merged into a given element, then add the corresponding - * group to "grouping". - * In particular, the domain is extended with the statement instances - * of the merged leaves, the contraction is extended with a mapping - * of these statement instances to instances of a new group and - * the schedule is extended with a schedule that executes - * the statement instances according to the order of the leaves - * in which they appear. - * Since the instances of the groups should already be scheduled apart - * in the schedule into which this schedule will be plugged in, - * the schedules of the individual groups are combined independently - * of each other (as a set). 
- */ -static isl_stat add_groups(struct ppcg_grouping *grouping, - int n, struct ppcg_grouping_leaf leaves[]) -{ - int i; - - for (i = 0; i < n; ++i) { - int n_leaf; - isl_schedule *schedule; - isl_union_set *domain; - isl_union_pw_multi_aff *upma; - - n_leaf = isl_union_set_list_n_union_set(leaves[i].list); - if (n_leaf < 0) - return isl_stat_error; - if (n_leaf <= 1) - continue; - schedule = schedule_from_domain_and_list(leaves[i].domain, - leaves[i].list); - upma = group_contraction_from_prefix_and_domain(grouping, - leaves[i].prefix, leaves[i].domain); - - domain = isl_union_set_copy(leaves[i].domain); - if (grouping->domain) { - domain = isl_union_set_union(domain, grouping->domain); - upma = isl_union_pw_multi_aff_union_add(upma, - grouping->contraction); - schedule = isl_schedule_set(schedule, - grouping->schedule); - } - grouping->domain = domain; - grouping->contraction = upma; - grouping->schedule = schedule; - - if (!grouping->domain || !grouping->contraction || - !grouping->schedule) - return isl_stat_error; - } - - return isl_stat_ok; -} - -/* Look for any pairs of consecutive leaves among the "n" children of "node" - * starting at "first" that should be merged together. - * Store the results in "grouping". - * - * First make sure the intersection of validity and proximity - * schedule constraints is available and extract the required - * information from the "n" leaves. - * Then try and merge consecutive leaves based on the validity - * and proximity constraints. - * If any pairs were successfully merged, then add groups - * corresponding to the merged leaves to "grouping". 
- */ -static isl_stat group_subsequence(__isl_keep isl_schedule_node *node, - int first, int n, struct ppcg_grouping *grouping) -{ - int n_merge; - struct ppcg_grouping_leaf *leaves; - - if (ppcg_grouping_compute_dep(grouping) < 0) - return isl_stat_error; - - leaves = extract_leaves(node, first, n); - if (!leaves) - return isl_stat_error; - - n_merge = merge_leaves(n, leaves, grouping->dep); - if (n_merge >= 0 && n_merge < n && - add_groups(grouping, n_merge, leaves) < 0) - return isl_stat_error; - - ppcg_grouping_leaf_free(n, leaves); - - return isl_stat_ok; -} - -/* If "node" is a sequence, then check if it has any consecutive - * leaves that should be merged together and store the results - * in "grouping". - * - * In particular, call group_subsequence on each consecutive - * sequence of (filtered) leaves among the children of "node". - */ -static isl_bool detect_groups(__isl_keep isl_schedule_node *node, void *user) -{ - int i, n, first; - struct ppcg_grouping *grouping = user; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence) - return isl_bool_true; - - n = isl_schedule_node_n_children(node); - if (n < 0) - return isl_bool_error; - - first = -1; - for (i = 0; i < n; ++i) { - isl_schedule_node *child; - enum isl_schedule_node_type type; - - child = isl_schedule_node_get_child(node, i); - child = isl_schedule_node_child(child, 0); - type = isl_schedule_node_get_type(child); - isl_schedule_node_free(child); - - if (first >= 0 && type != isl_schedule_node_leaf) { - if (group_subsequence(node, first, i - first, - grouping) < 0) - return isl_bool_error; - first = -1; - } - if (first < 0 && type == isl_schedule_node_leaf) - first = i; - } - if (first >= 0) { - if (group_subsequence(node, first, n - first, grouping) < 0) - return isl_bool_error; - } - - return isl_bool_true; -} - -/* Complete "grouping" to cover all statement instances in the domain - * of grouping->sc. 
- * - * In particular, grouping->domain is set to the full set of statement - * instances; group->contraction is extended with an identity - * contraction on the additional instances and group->schedule - * is extended with an independent schedule on those additional instances. - * In the extension of group->contraction, the additional instances - * are split into those belong to different statements and those - * that belong to some of the same statements. The first group - * is replaced by its universe in order to simplify the contraction extension. - */ -static void complete_grouping(struct ppcg_grouping *grouping) -{ - isl_union_set *domain, *left, *overlap; - isl_union_pw_multi_aff *upma; - isl_schedule *schedule; - - domain = isl_schedule_constraints_get_domain(grouping->sc); - left = isl_union_set_subtract(isl_union_set_copy(domain), - isl_union_set_copy(grouping->domain)); - schedule = isl_schedule_from_domain(isl_union_set_copy(left)); - schedule = isl_schedule_set(schedule, grouping->schedule); - grouping->schedule = schedule; - - overlap = isl_union_set_universe(grouping->domain); - grouping->domain = domain; - overlap = isl_union_set_intersect(isl_union_set_copy(left), overlap); - left = isl_union_set_subtract(left, isl_union_set_copy(overlap)); - left = isl_union_set_universe(left); - left = isl_union_set_union(left, overlap); - upma = isl_union_set_identity_union_pw_multi_aff(left); - upma = isl_union_pw_multi_aff_union_add(upma, grouping->contraction); - grouping->contraction = upma; -} - -/* Compute a schedule on the domain of "sc" that respects the schedule - * constraints in "sc". - * - * "schedule" is a known correct schedule that is used to combine - * groups of statements if options->group_chains is set. 
- * In particular, statements that are executed consecutively in a sequence - * in this schedule and where all instances of the second depend on - * the instance of the first that is executed in the same iteration - * of outer band nodes are grouped together into a single statement. - * The schedule constraints are then mapped to these groups of statements - * and the resulting schedule is expanded again to refer to the original - * statements. - */ -__isl_give isl_schedule *ppcg_compute_schedule( - __isl_take isl_schedule_constraints *sc, - __isl_keep isl_schedule *schedule, struct ppcg_options *options) -{ - struct ppcg_grouping grouping = { sc }; - isl_union_pw_multi_aff *contraction; - isl_union_map *umap; - isl_schedule *res, *expansion; - - if (!options->group_chains) - return isl_schedule_constraints_compute_schedule(sc); - - grouping.group_id = 0; - if (isl_schedule_foreach_schedule_node_top_down(schedule, - &detect_groups, &grouping) < 0) - goto error; - if (!grouping.contraction) { - ppcg_grouping_clear(&grouping); - return isl_schedule_constraints_compute_schedule(sc); - } - complete_grouping(&grouping); - contraction = isl_union_pw_multi_aff_copy(grouping.contraction); - umap = isl_union_map_from_union_pw_multi_aff(contraction); - - sc = isl_schedule_constraints_apply(sc, umap); - - res = isl_schedule_constraints_compute_schedule(sc); - - contraction = isl_union_pw_multi_aff_copy(grouping.contraction); - expansion = isl_schedule_copy(grouping.schedule); - res = isl_schedule_expand(res, contraction, expansion); - - ppcg_grouping_clear(&grouping); - return res; -error: - ppcg_grouping_clear(&grouping); - isl_schedule_constraints_free(sc); - return NULL; -} diff --git a/polly/lib/External/ppcg/hybrid.h b/polly/lib/External/ppcg/hybrid.h deleted file mode 100644 --- a/polly/lib/External/ppcg/hybrid.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef HYBRID_H -#define HYBRID_H - -#include -#include - -#include "ppcg.h" - -struct ppcg_ht_bounds; -typedef struct 
ppcg_ht_bounds ppcg_ht_bounds; - -struct ppcg_ht_phase; -typedef struct ppcg_ht_phase ppcg_ht_phase; - -isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node); -isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node); - -__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop, - __isl_keep isl_schedule_node *node); -void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds); -isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds); -isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds, - __isl_keep isl_multi_val *sizes); -__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling( - __isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes, - __isl_take isl_schedule_node *node, struct ppcg_options *options); -__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free( - __isl_take ppcg_ht_bounds *bounds); - -__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark( - __isl_keep isl_schedule_node *node); -__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point( - __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node); -__isl_give isl_schedule_node *hybrid_tile_foreach_phase( - __isl_take isl_schedule_node *node, - __isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node, - void *user), void *user); -__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks( - __isl_take isl_schedule_node *node); - -#endif diff --git a/polly/lib/External/ppcg/hybrid.c b/polly/lib/External/ppcg/hybrid.c deleted file mode 100644 --- a/polly/lib/External/ppcg/hybrid.c +++ /dev/null @@ -1,2242 +0,0 @@ -/* - * Copyright 2013 Ecole Normale Superieure - * Copyright 2015 Sven Verdoolaege - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hybrid.h" 
-#include "schedule.h" - -/* The hybrid tiling implemented in this file is based on - * Grosser et al., "Hybrid Hexagonal/Classical Tiling for GPUs". - */ - -/* Bounds on relative dependence distances in input to hybrid tiling. - * upper is an upper bound on the relative dependence distances - * in the first space dimension - * -lower is a lower bound on the relative dependence distances - * in all space dimensions. - * - * In particular, - * - * d_i >= -lower_i d_0 - * and - * d_1 <= upper d_0 - * - * for each dependence distance vector d, where d_1 is the component - * corresponding to the first space dimension. - * - * upper and lower are always non-negative. - * Some of the values may be NaN if no bound could be found. - */ -struct ppcg_ht_bounds { - isl_val *upper; - isl_multi_val *lower; -}; - -/* Free "bounds" along with all its fields. - */ -__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free( - __isl_take ppcg_ht_bounds *bounds) -{ - if (!bounds) - return NULL; - isl_val_free(bounds->upper); - isl_multi_val_free(bounds->lower); - free(bounds); - - return NULL; -} - -/* Create a ppcg_ht_bounds object for a band living in "space". - * The bounds are initialized to NaN. 
- */ -__isl_give ppcg_ht_bounds *ppcg_ht_bounds_alloc(__isl_take isl_space *space) -{ - int i, n; - isl_ctx *ctx; - ppcg_ht_bounds *bounds; - - if (!space) - return NULL; - - ctx = isl_space_get_ctx(space); - bounds = isl_alloc_type(ctx, struct ppcg_ht_bounds); - if (!bounds) - goto error; - bounds->upper = isl_val_nan(ctx); - bounds->lower = isl_multi_val_zero(space); - n = isl_multi_val_dim(bounds->lower, isl_dim_set); - for (i = 0; i < n; ++i) { - isl_val *v = isl_val_copy(bounds->upper); - bounds->lower = isl_multi_val_set_val(bounds->lower, i, v); - } - - if (!bounds->lower || !bounds->upper) - return ppcg_ht_bounds_free(bounds); - - return bounds; -error: - isl_space_free(space); - return NULL; -} - -void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds) -{ - if (!bounds) - return; - - fprintf(stderr, "lower: "); - isl_multi_val_dump(bounds->lower); - fprintf(stderr, "upper: "); - isl_val_dump(bounds->upper); -} - -/* Return the upper bound on the relative dependence distances - * in the first space dimension. - */ -__isl_give isl_val *ppcg_ht_bounds_get_upper(__isl_keep ppcg_ht_bounds *bounds) -{ - if (!bounds) - return NULL; - return isl_val_copy(bounds->upper); -} - -/* Replace the upper bound on the relative dependence distances - * in the first space dimension by "upper". - */ -__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_upper( - __isl_take ppcg_ht_bounds *bounds, __isl_take isl_val *upper) -{ - if (!bounds || !upper) - goto error; - isl_val_free(bounds->upper); - bounds->upper = upper; - return bounds; -error: - ppcg_ht_bounds_free(bounds); - isl_val_free(upper); - return NULL; -} - -/* Return the lower bound on the relative dependence distances - * in space dimension "pos". 
- */ -__isl_give isl_val *ppcg_ht_bounds_get_lower(__isl_keep ppcg_ht_bounds *bounds, - int pos) -{ - if (!bounds) - return NULL; - return isl_multi_val_get_val(bounds->lower, pos); -} - -/* Replace the lower bound on the relative dependence distances - * in space dimension "pos" by "lower". - */ -__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_lower( - __isl_take ppcg_ht_bounds *bounds, int pos, __isl_take isl_val *lower) -{ - if (!bounds || !lower) - goto error; - bounds->lower = isl_multi_val_set_val(bounds->lower, pos, lower); - if (!bounds->lower) - return ppcg_ht_bounds_free(bounds); - return bounds; -error: - ppcg_ht_bounds_free(bounds); - isl_val_free(lower); - return NULL; -} - -/* Can the bounds on relative dependence distances recorded in "bounds" - * be used to perform hybrid tiling? - * In particular, have appropriate lower and upper bounds been found? - * Any NaN indicates that no corresponding bound was found. - */ -isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds) -{ - isl_bool is_nan; - int i, n; - - if (!bounds) - return isl_bool_error; - is_nan = isl_val_is_nan(bounds->upper); - if (is_nan < 0) - return isl_bool_error; - if (is_nan) - return isl_bool_false; - - n = isl_multi_val_dim(bounds->lower, isl_dim_set); - for (i = 0; i < n; ++i) { - isl_val *v; - - v = isl_multi_val_get_val(bounds->lower, i); - is_nan = isl_val_is_nan(v); - if (is_nan < 0) - return isl_bool_error; - if (is_nan) - return isl_bool_false; - isl_val_free(v); - } - - return isl_bool_true; -} - -/* Structure that represents the basic hexagonal tiling, - * along with information that is needed to perform the hybrid tiling. - * - * "bounds" are the bounds on the dependence distances that - * define the hexagonal shape and the required skewing in the remaining - * space dimensions. - * - * "input_node" points to the input pair of band nodes. - * "input_schedule" is the partial schedule of this input pair of band nodes. 
- * The space of this schedule is [P -> C], where P is the space - * of the parent node and C is the space of the child node. - * - * "space_sizes" represent the total size of a tile for the space - * dimensions, i.e., those corresponding to the child node. - * The space of "space_sizes" is C. - * If S_0 is the original tile size in the first space dimension, - * then the first entry of "space_sizes" is equal to - * W = 2*S_0 + floor(d_l h) + floor(d_u h). - * The remaining entries are the same as in the original tile sizes. - * - * The basic hexagonal tiling "hex" is defined - * in a "ts" (time-space) space and corresponds to the phase-1 tiles. - * "time_tile" maps the "ts" space to outer time tile. - * Is is equal to ts[t, s] -> floor(t/(2 * S_t)), with S_t the original tile - * size corresponding to the parent node. - * "local_time" maps the "ts" space to the time dimension inside each tile. - * It is equal to ts[t, s] -> t mod (2 S_t), with S_t the original tile - * size corresponding to the parent node. - * "shift_space" shifts the tiles at time tile T = floor(t/(2 S_t)) - * in the space dimension such that they align to a multiple of W. - * It is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W, - * with shift_s = S_0 + floor(d_u h). - * "shift_phase" is the shift taken to go from phase 0 to phase 1 - * It is equal to ts[t, s] -> ts[t + S_t, s + shift_s], - * with shift_s = S_0 + floor(d_u h). - * - * "project_ts" projects the space of the input schedule to the ts-space. - * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0]. 
- */ -struct ppcg_ht_tiling { - int ref; - - ppcg_ht_bounds *bounds; - isl_schedule_node *input_node; - isl_multi_union_pw_aff *input_schedule; - - isl_multi_val *space_sizes; - - isl_aff *time_tile; - isl_aff *local_time; - isl_aff *shift_space; - isl_multi_aff *shift_phase; - isl_set *hex; - - isl_multi_aff *project_ts; -}; -typedef struct ppcg_ht_tiling ppcg_ht_tiling; - -/* Return the space of the pair of band nodes that form the input - * to the hybrid tiling. - * In particular, return the space [P -> C], where P is the space - * of the parent node and C is the space of the child node. - */ -__isl_give isl_space *ppcg_ht_tiling_get_input_space( - __isl_keep ppcg_ht_tiling *tile) -{ - if (!tile) - return NULL; - - return isl_multi_union_pw_aff_get_space(tile->input_schedule); -} - -/* Remove a reference to "tile" and free "tile" along with all its fields - * as soon as the reference count drops to zero. - */ -static __isl_null ppcg_ht_tiling *ppcg_ht_tiling_free( - __isl_take ppcg_ht_tiling *tiling) -{ - if (!tiling) - return NULL; - if (--tiling->ref > 0) - return NULL; - - ppcg_ht_bounds_free(tiling->bounds); - isl_schedule_node_free(tiling->input_node); - isl_multi_union_pw_aff_free(tiling->input_schedule); - isl_multi_val_free(tiling->space_sizes); - isl_aff_free(tiling->time_tile); - isl_aff_free(tiling->local_time); - isl_aff_free(tiling->shift_space); - isl_multi_aff_free(tiling->shift_phase); - isl_set_free(tiling->hex); - isl_multi_aff_free(tiling->project_ts); - free(tiling); - - return NULL; -} - -/* Return a new reference to "tiling". - */ -__isl_give ppcg_ht_tiling *ppcg_ht_tiling_copy( - __isl_keep ppcg_ht_tiling *tiling) -{ - if (!tiling) - return NULL; - - tiling->ref++; - return tiling; -} - -/* Return the isl_ctx to which "tiling" belongs. 
- */ -isl_ctx *ppcg_ht_tiling_get_ctx(__isl_keep ppcg_ht_tiling *tiling) -{ - if (!tiling) - return NULL; - - return isl_multi_union_pw_aff_get_ctx(tiling->input_schedule); -} - -/* Representation of one of the two phases of hybrid tiling. - * - * "tiling" points to the shared tiling data. - * - * "time_tile", "local_time" and "shift_space" are equal to the corresponding - * fields of "tiling", pulled back to the input space. - * In case of phase 0, these expressions have also been moved - * from phase 1 to phase 0. - * - * "domain" contains the hexagonal tiling of this phase. - * - * "space_shift" is the shift that should be added to the space band - * in order to be able to apply rectangular tiling to the space. - * For phase 1, it is equal to - * - * [P[t] -> C[s_0, s_i]] -> C[(-(2 * shift_s)*T) % W, dl_i * u] - * - * with shift_s = S_0 + floor(d_u h), - * T equal to "time_tile" and u equal to "local_time". - * For phase 0, it is equal to - * - * [P[t] -> C[s_0, s_i]] -> C[shift_s + (-(2 * shift_s)*T) % W, dl_i * u] - * - * "space_tile" is the space tiling. It is equal to - * - * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size] - */ -struct ppcg_ht_phase { - ppcg_ht_tiling *tiling; - - isl_aff *time_tile; - isl_aff *local_time; - isl_aff *shift_space; - isl_set *domain; - - isl_multi_aff *space_shift; - isl_multi_aff *space_tile; -}; - -/* Free "phase" along with all its fields. - */ -static __isl_null ppcg_ht_phase *ppcg_ht_phase_free( - __isl_take ppcg_ht_phase *phase) -{ - if (!phase) - return NULL; - - ppcg_ht_tiling_free(phase->tiling); - isl_aff_free(phase->time_tile); - isl_aff_free(phase->local_time); - isl_aff_free(phase->shift_space); - isl_set_free(phase->domain); - isl_multi_aff_free(phase->space_shift); - isl_multi_aff_free(phase->space_tile); - free(phase); - - return NULL; -} - -/* Wrapper around ppcg_ht_phase_free for use as an argument - * to isl_id_set_free_user. 
- */ -static void ppcg_ht_phase_free_wrap(void *user) -{ - ppcg_ht_phase *phase = user; - - ppcg_ht_phase_free(phase); -} - -/* Return the domain of hybrid tiling phase "phase". - */ -static __isl_give isl_set *ppcg_ht_phase_get_domain(ppcg_ht_phase *phase) -{ - if (!phase) - return NULL; - - return isl_set_copy(phase->domain); -} - -/* Return the space of the pair of band nodes that form the input - * to the hybrid tiling of which "phase" is a phase. - * In particular, return the space [P -> C], where P is the space - * of the parent node and C is the space of the child node. - */ -static __isl_give isl_space *ppcg_ht_phase_get_input_space( - __isl_keep ppcg_ht_phase *phase) -{ - if (!phase) - return NULL; - - return ppcg_ht_tiling_get_input_space(phase->tiling); -} - -/* Construct the lower left constraint of the hexagonal tile, i.e., - * - * du a - b <= (2h+1) du - duh - * -du a + b + (2h+1) du - duh >= 0 - * - * where duh = floor(du * h). - * - * This constraint corresponds to (6) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". - */ -static __isl_give isl_constraint *hex_lower_left(__isl_take isl_local_space *ls, - __isl_keep isl_val *h, __isl_keep isl_val *du, __isl_keep isl_val *duh) -{ - isl_val *v; - isl_aff *aff; - - v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); - v = isl_val_mul(v, isl_val_copy(du)); - v = isl_val_sub(v, isl_val_copy(duh)); - aff = isl_aff_val_on_domain(ls, v); - v = isl_val_neg(isl_val_copy(du)); - aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the lower constraint of the hexagonal tile, i.e., - * - * a <= 2h+1 - * -a + 2h+1 >= 0 - * - * This constraint corresponds to (7) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". 
- */ -static __isl_give isl_constraint *hex_lower(__isl_take isl_local_space *ls, - __isl_keep isl_val *h) -{ - isl_val *v; - isl_aff *aff; - - v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); - aff = isl_aff_val_on_domain(ls, v); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 0, -1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the lower right constraint of the hexagonal tile, i.e., - * - * dl a + b <= (2h+1) dl + duh + (s0-1) - * -dl a - b + (2h+1) dl + duh + (s0-1) >= 0 - * - * where duh = floor(du * h). - * - * This constraint corresponds to (8) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". - */ -static __isl_give isl_constraint *hex_lower_right( - __isl_take isl_local_space *ls, __isl_keep isl_val *h, - __isl_keep isl_val *s0, __isl_keep isl_val *dl, __isl_keep isl_val *duh) -{ - isl_val *v; - isl_aff *aff; - - v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); - v = isl_val_mul(v, isl_val_copy(dl)); - v = isl_val_add(v, isl_val_copy(duh)); - v = isl_val_add(v, isl_val_copy(s0)); - v = isl_val_sub_ui(v, 1); - aff = isl_aff_val_on_domain(ls, v); - v = isl_val_neg(isl_val_copy(dl)); - aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the upper left constraint of the hexagonal tile, i.e., - * - * dl a + b >= h dl - (d - 1)/d with d = den(dl) - * dl a + b - h dl + (d - 1)/d >= 0 - * - * This constraint corresponds to (10) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". 
- */ -static __isl_give isl_constraint *hex_upper_left(__isl_take isl_local_space *ls, - __isl_keep isl_val *h, __isl_keep isl_val *dl) -{ - isl_val *v, *d; - isl_aff *aff; - - d = isl_val_get_den_val(dl); - v = isl_val_sub_ui(isl_val_copy(d), 1); - v = isl_val_div(v, d); - v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(dl))); - aff = isl_aff_val_on_domain(ls, v); - aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(dl)); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the upper right constraint of the hexagonal tile, i.e., - * - * du a - b >= du h - duh - (s0-1) - dlh - (d - 1)/d with d = den(du) - * du a - b - du h + duh + (s0-1) + dlh + (d - 1)/d >= 0 - * - * where dlh = floor(dl * h) and duh = floor(du * h). - * - * This constraint corresponds to (12) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". - */ -static __isl_give isl_constraint *hex_upper_right( - __isl_take isl_local_space *ls, __isl_keep isl_val *h, - __isl_keep isl_val *s0, __isl_keep isl_val *du, - __isl_keep isl_val *dlh, __isl_keep isl_val *duh) -{ - isl_val *v, *d; - isl_aff *aff; - - d = isl_val_get_den_val(du); - v = isl_val_sub_ui(isl_val_copy(d), 1); - v = isl_val_div(v, d); - v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(du))); - v = isl_val_add(v, isl_val_copy(duh)); - v = isl_val_add(v, isl_val_copy(dlh)); - v = isl_val_add(v, isl_val_copy(s0)); - v = isl_val_sub_ui(v, 1); - aff = isl_aff_val_on_domain(ls, v); - aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(du)); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the uppper constraint of the hexagonal tile, i.e., - * - * a >= 0 - * - * This constraint corresponds to (13) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". 
- */ -static __isl_give isl_constraint *hex_upper(__isl_take isl_local_space *ls) -{ - isl_aff *aff; - - aff = isl_aff_var_on_domain(ls, isl_dim_set, 0); - - return isl_inequality_from_aff(aff); -} - -/* Construct the basic hexagonal tile shape. - * "space" is the 2D space in which the hexagon should be constructed. - * h is st-1, with st the tile size in the time dimension - * s0 is the tile size in the space dimension - * dl is a bound on the negative relative dependence distances, i.e., - * - * d_s >= -dl d_t - * - * du is a bound on the positive relative dependence distances, i.e., - * - * d_s <= du d_t - * - * with (d_t,d_s) any dependence distance vector. - * dlh = floor(dl * h) - * duh = floor(du * h) - * - * The shape of the hexagon is as follows: - * - * 0 dlh dlh+s0-1 - * ______ __ - * 0 / \_ / - * / \_ / - * h / \ ______ / - * h+1 \_ // \\_ - * \_ // \\_ - * 2h+1 \______// \\ - * 0 duh duh+s0-1 - * duh+s0-1+dlh - * duh+s0-1+dlh+1+s0+1 - * - * The next hexagon is shifted by duh + dlh + 2 * s0. - * - * The slope of the "/" constraints is dl. - * The slope of the "\_" constraints is du. 
- */ -static __isl_give isl_set *compute_hexagon(__isl_take isl_space *space, - __isl_keep isl_val *h, __isl_keep isl_val *s0, - __isl_keep isl_val *dl, __isl_keep isl_val *du, - __isl_keep isl_val *dlh, __isl_keep isl_val *duh) -{ - isl_local_space *ls; - isl_constraint *c; - isl_basic_set *bset; - - ls = isl_local_space_from_space(space); - - c = hex_lower_left(isl_local_space_copy(ls), h, du, duh); - bset = isl_basic_set_from_constraint(c); - - c = hex_lower(isl_local_space_copy(ls), h); - bset = isl_basic_set_add_constraint(bset, c); - - c = hex_lower_right(isl_local_space_copy(ls), h, s0, dl, duh); - bset = isl_basic_set_add_constraint(bset, c); - - c = hex_upper_left(isl_local_space_copy(ls), h, dl); - bset = isl_basic_set_add_constraint(bset, c); - - c = hex_upper_right(isl_local_space_copy(ls), h, s0, du, dlh, duh); - bset = isl_basic_set_add_constraint(bset, c); - - c = hex_upper(ls); - bset = isl_basic_set_add_constraint(bset, c); - - return isl_set_from_basic_set(bset); -} - -/* Name of the ts-space. - */ -static const char *ts_space_name = "ts"; - -/* Construct and return the space ts[t, s]. - */ -static __isl_give isl_space *construct_ts_space(isl_ctx *ctx) -{ - isl_space *s; - - s = isl_space_set_alloc(ctx, 0, 2); - s = isl_space_set_tuple_name(s, isl_dim_set, ts_space_name); - - return s; -} - -/* Name of the local ts-space. - */ -static const char *local_ts_space_name = "local_ts"; - -/* Construct and return the space local_ts[t, s]. - */ -static __isl_give isl_space *construct_local_ts_space(isl_ctx *ctx) -{ - isl_space *s; - - s = isl_space_set_alloc(ctx, 0, 2); - s = isl_space_set_tuple_name(s, isl_dim_set, local_ts_space_name); - - return s; -} - -/* Compute the total size of a tile for the space dimensions, - * i.e., those corresponding to the child node - * of the input pattern. - * If S_0 is the original tile size in the first space dimension, - * then the first entry of "space_sizes" is equal to - * W = 2*S_0 + floor(d_l h) + floor(d_u h). 
- * The remaining entries are the same as in the original tile sizes. - * "tile_sizes" contains the original tile sizes, including - * the tile size corresponding to the parent node. - * "dlh" is equal to floor(d_l h). - * "duh" is equal to floor(d_u h). - */ -static __isl_give isl_multi_val *compute_space_sizes( - __isl_keep isl_multi_val *tile_sizes, - __isl_keep isl_val *dlh, __isl_keep isl_val *duh) -{ - isl_val *size; - isl_multi_val *space_sizes; - - space_sizes = isl_multi_val_copy(tile_sizes); - space_sizes = isl_multi_val_factor_range(space_sizes); - size = isl_multi_val_get_val(space_sizes, 0); - size = isl_val_mul_ui(size, 2); - size = isl_val_add(size, isl_val_copy(duh)); - size = isl_val_add(size, isl_val_copy(dlh)); - space_sizes = isl_multi_val_set_val(space_sizes, 0, size); - - return space_sizes; -} - -/* Compute the offset of phase 1 with respect to phase 0 - * in the ts-space ("space"). - * In particular, return - * - * ts[st, s0 + duh] - */ -static __isl_give isl_multi_val *compute_phase_shift( - __isl_keep isl_space *space, __isl_keep isl_val *st, - __isl_keep isl_val *s0, __isl_keep isl_val *duh) -{ - isl_val *v; - isl_multi_val *phase_shift; - - phase_shift = isl_multi_val_zero(isl_space_copy(space)); - phase_shift = isl_multi_val_set_val(phase_shift, 0, isl_val_copy(st)); - v = isl_val_add(isl_val_copy(duh), isl_val_copy(s0)); - phase_shift = isl_multi_val_set_val(phase_shift, 1, v); - - return phase_shift; -} - -/* Return the function - * - * ts[t, s] -> floor(t/(2 * st)) - * - * representing the time tile. - * "space" is the space ts[t, s]. 
- */ -static __isl_give isl_aff *compute_time_tile(__isl_keep isl_space *space, - __isl_keep isl_val *st) -{ - isl_val *v; - isl_aff *t; - isl_local_space *ls; - - ls = isl_local_space_from_space(isl_space_copy(space)); - t = isl_aff_var_on_domain(ls, isl_dim_set, 0); - v = isl_val_mul_ui(isl_val_copy(st), 2); - t = isl_aff_floor(isl_aff_scale_down_val(t, v)); - - return t; -} - -/* Compute a shift in the space dimension for tiles - * at time tile T = floor(t/(2 * S_t)) - * such that they align to a multiple of the total space tile dimension W. - * In particular, compute - * - * ts[t, s] -> s + (-(2 * shift_s)*T) % W - * - * where shift_s is the shift of phase 1 with respect to phase 0 - * in the space dimension (the first element of "phase_shift"). - * W is stored in the first element of "space_sizes". - * "time_tile" is the function - * - * ts[t, s] -> floor(t/(2 * S_T)) - * - * Since phase 1 is shifted by shift_s with respect to phase 0, - * the next line of phase 0 (at T+1) is shifted by 2*shift_s - * with respect to the previous line (at T). - * A shift of -(2 * shift_s)*T therefore allows the basic pattern - * (which starts at 0) to be applied. - * However, this shift will be used to obtain the tile coordinate - * in the first space dimension and if the original values - * in the space dimension are non-negative, then the shift should - * not make them negative. Moreover, the shift should be as minimal - * as possible. - * Since the pattern repeats itself with a period of W in the space - * dimension, the shift can be replaced by (-(2 * shift_s)*T) % W. 
- */ -static __isl_give isl_aff *compute_shift_space(__isl_keep isl_aff *time_tile, - __isl_keep isl_multi_val *space_sizes, - __isl_keep isl_multi_val *phase_shift) -{ - isl_val *v; - isl_aff *s, *t; - isl_local_space *ls; - - ls = isl_local_space_from_space(isl_aff_get_domain_space(time_tile)); - t = isl_aff_copy(time_tile); - v = isl_val_mul_ui(isl_multi_val_get_val(phase_shift, 1), 2); - v = isl_val_neg(v); - t = isl_aff_scale_val(t, v); - v = isl_multi_val_get_val(space_sizes, 0); - t = isl_aff_mod_val(t, v); - s = isl_aff_var_on_domain(ls, isl_dim_set, 1); - s = isl_aff_add(s, t); - - return s; -} - -/* Give the phase_shift ts[S_t, S_0 + floor(d_u h)], - * compute a function that applies the shift, i.e., - * - * ts[t, s] -> ts[t + S_t, s + S_0 + floor(d_u h)], - */ -static __isl_give isl_multi_aff *compute_shift_phase( - __isl_keep isl_multi_val *phase_shift) -{ - isl_space *space; - isl_multi_aff *shift; - - space = isl_multi_val_get_space(phase_shift); - shift = isl_multi_aff_multi_val_on_space(space, - isl_multi_val_copy(phase_shift)); - space = isl_multi_aff_get_space(shift); - shift = isl_multi_aff_add(shift, isl_multi_aff_identity(space)); - - return shift; -} - -/* Compute a mapping from the ts-space to the local coordinates - * within each tile. In particular, compute - * - * ts[t, s] -> local_ts[t % (2 S_t), (s + (-(2 * shift_s)*T) % W) % W] - * - * "ts" is the space ts[t, s] - * "local_ts" is the space local_ts[t, s] - * "shift_space" is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W - * "st" is the tile size in the time dimension S_t. - * The first element of "space_sizes" is equal to W. 
- */ -static __isl_give isl_multi_aff *compute_localize( - __isl_keep isl_space *local_ts, __isl_keep isl_aff *shift_space, - __isl_keep isl_val *st, __isl_keep isl_multi_val *space_sizes) -{ - isl_val *v; - isl_space *space; - isl_aff *s, *t; - isl_multi_aff *localize; - - space = isl_aff_get_domain_space(shift_space); - local_ts = isl_space_copy(local_ts); - space = isl_space_map_from_domain_and_range(space, local_ts); - localize = isl_multi_aff_identity(space); - t = isl_multi_aff_get_aff(localize, 0); - v = isl_val_mul_ui(isl_val_copy(st), 2); - t = isl_aff_mod_val(t, v); - localize = isl_multi_aff_set_aff(localize, 0, t); - s = isl_aff_copy(shift_space); - v = isl_multi_val_get_val(space_sizes, 0); - s = isl_aff_mod_val(s, v); - localize = isl_multi_aff_set_aff(localize, 1, s); - - return localize; -} - -/* Set the project_ts field of "tiling". - * - * This field projects the space of the input schedule to the ts-space. - * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0]. - */ -static __isl_give ppcg_ht_tiling *ppcg_ht_tiling_set_project_ts( - __isl_take ppcg_ht_tiling *tiling) -{ - int n; - isl_space *space; - isl_multi_aff *project; - - if (!tiling) - return NULL; - - space = ppcg_ht_tiling_get_input_space(tiling); - n = isl_space_dim(space, isl_dim_set); - project = isl_multi_aff_project_out_map(space, isl_dim_set, 2, n - 2); - project = isl_multi_aff_set_tuple_name(project, - isl_dim_out, ts_space_name); - if (!project) - return ppcg_ht_tiling_free(tiling); - - tiling->project_ts = project; - - return tiling; -} - -/* Construct a hybrid tiling description from bounds on the dependence - * distances "bounds". - * "input_node" points to the original parent node. - * "input_schedule" is the combined schedule of the parent and child - * node in the input. - * "tile_sizes" are the original, user specified tile sizes. 
- */ -static __isl_give ppcg_ht_tiling *ppcg_ht_bounds_construct_tiling( - __isl_take ppcg_ht_bounds *bounds, - __isl_keep isl_schedule_node *input_node, - __isl_keep isl_multi_union_pw_aff *input_schedule, - __isl_keep isl_multi_val *tile_sizes) -{ - isl_ctx *ctx; - ppcg_ht_tiling *tiling; - isl_multi_val *space_sizes, *phase_shift; - isl_aff *time_tile, *shift_space; - isl_multi_aff *localize; - isl_val *h, *duh, *dlh; - isl_val *st, *s0, *du, *dl; - isl_space *ts, *local_ts; - - if (!bounds || !input_node || !input_schedule || !tile_sizes) - goto error; - - ctx = isl_multi_union_pw_aff_get_ctx(input_schedule); - tiling = isl_calloc_type(ctx, struct ppcg_ht_tiling); - if (!tiling) - goto error; - tiling->ref = 1; - - st = isl_multi_val_get_val(tile_sizes, 0); - h = isl_val_sub_ui(isl_val_copy(st), 1); - s0 = isl_multi_val_get_val(tile_sizes, 1); - du = ppcg_ht_bounds_get_upper(bounds); - dl = ppcg_ht_bounds_get_lower(bounds, 0); - - duh = isl_val_floor(isl_val_mul(isl_val_copy(du), isl_val_copy(h))); - dlh = isl_val_floor(isl_val_mul(isl_val_copy(dl), isl_val_copy(h))); - - ts = construct_ts_space(ctx); - local_ts = construct_local_ts_space(ctx); - - space_sizes = compute_space_sizes(tile_sizes, dlh, duh); - phase_shift = compute_phase_shift(ts, st, s0, duh); - time_tile = compute_time_tile(ts, st); - shift_space = compute_shift_space(time_tile, space_sizes, phase_shift); - localize = compute_localize(local_ts, shift_space, st, space_sizes); - isl_space_free(ts); - - tiling->input_node = isl_schedule_node_copy(input_node); - tiling->input_schedule = isl_multi_union_pw_aff_copy(input_schedule); - tiling->space_sizes = space_sizes; - tiling->bounds = bounds; - tiling->local_time = isl_multi_aff_get_aff(localize, 0); - tiling->hex = compute_hexagon(local_ts, h, s0, dl, du, dlh, duh); - tiling->hex = isl_set_preimage_multi_aff(tiling->hex, localize); - tiling->time_tile = time_tile; - tiling->shift_space = shift_space; - tiling->shift_phase = 
compute_shift_phase(phase_shift); - isl_multi_val_free(phase_shift); - - isl_val_free(duh); - isl_val_free(dlh); - isl_val_free(du); - isl_val_free(dl); - isl_val_free(s0); - isl_val_free(st); - isl_val_free(h); - - if (!tiling->input_schedule || !tiling->local_time || !tiling->hex || - !tiling->shift_space || !tiling->shift_phase) - return ppcg_ht_tiling_free(tiling); - - tiling = ppcg_ht_tiling_set_project_ts(tiling); - - return tiling; -error: - ppcg_ht_bounds_free(bounds); - return NULL; -} - -/* Are all members of the band node "node" coincident? - */ -static isl_bool all_coincident(__isl_keep isl_schedule_node *node) -{ - int i, n; - - n = isl_schedule_node_band_n_member(node); - for (i = 0; i < n; ++i) { - isl_bool c; - - c = isl_schedule_node_band_member_get_coincident(node, i); - if (c < 0 || !c) - return c; - } - - return isl_bool_true; -} - -/* Does "node" satisfy the properties of the inner node in the input - * pattern for hybrid tiling? - * That is, is it a band node with only coincident members, of which - * there is at least one? - */ -static isl_bool has_child_properties(__isl_keep isl_schedule_node *node) -{ - if (!node) - return isl_bool_error; - if (isl_schedule_node_get_type(node) != isl_schedule_node_band) - return isl_bool_false; - if (isl_schedule_node_band_n_member(node) < 1) - return isl_bool_false; - return all_coincident(node); -} - -/* Does "node" satisfy the properties of the outer node in the input - * pattern for hybrid tiling? - * That is, is it a band node with a single member? - */ -static isl_bool has_parent_properties(__isl_keep isl_schedule_node *node) -{ - if (!node) - return isl_bool_error; - if (isl_schedule_node_get_type(node) != isl_schedule_node_band) - return isl_bool_false; - if (isl_schedule_node_band_n_member(node) != 1) - return isl_bool_false; - return isl_bool_true; -} - -/* Does the parent of "node" satisfy the input patttern for hybrid tiling? 
- * That is, does "node" satisfy the properties of the inner node and - * does the parent of "node" satisfy the properties of the outer node? - */ -isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node) -{ - isl_bool has_pattern; - - has_pattern = has_child_properties(node); - if (has_pattern < 0 || !has_pattern) - return has_pattern; - - node = isl_schedule_node_copy(node); - node = isl_schedule_node_parent(node); - has_pattern = has_parent_properties(node); - isl_schedule_node_free(node); - - return has_pattern; -} - -/* Does "node" satisfy the input patttern for hybrid tiling? - * That is, does "node" satisfy the properties of the outer node and - * does the child of "node" satisfy the properties of the inner node? - */ -isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node) -{ - isl_bool has_pattern; - - has_pattern = has_parent_properties(node); - if (has_pattern < 0 || !has_pattern) - return has_pattern; - - node = isl_schedule_node_get_child(node, 0); - has_pattern = has_child_properties(node); - isl_schedule_node_free(node); - - return has_pattern; -} - -/* Check that "node" satisfies the input pattern for hybrid tiling. - * Error out if it does not. - */ -static isl_stat check_input_pattern(__isl_keep isl_schedule_node *node) -{ - isl_bool has_pattern; - - has_pattern = ppcg_ht_has_input_pattern(node); - if (has_pattern < 0) - return isl_stat_error; - if (!has_pattern) - isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid, - "invalid input pattern for hybrid tiling", - return isl_stat_error); - - return isl_stat_ok; -} - -/* Extract the input schedule from "node", i.e., the product - * of the partial schedules of the parent and child nodes - * in the input pattern. 
- */ -static __isl_give isl_multi_union_pw_aff *extract_input_schedule( - __isl_keep isl_schedule_node *node) -{ - isl_multi_union_pw_aff *partial, *partial2; - - partial = isl_schedule_node_band_get_partial_schedule(node); - node = isl_schedule_node_get_child(node, 0); - partial2 = isl_schedule_node_band_get_partial_schedule(node); - isl_schedule_node_free(node); - - return isl_multi_union_pw_aff_range_product(partial, partial2); -} - -/* Collect all dependences from "scop" that are relevant for performing - * hybrid tiling on "node" and its child and map them to the schedule - * space of this pair of nodes. - * - * In case live range reordering is not used, - * the flow and the false dependences are collected. - * In case live range reordering is used, - * the flow and the forced dependences are collected, as well - * as the order dependences that are adjacent to non-local - * flow dependences. - * - * In all cases, only dependences that map to the same instance - * of the outer part of the schedule are considered. 
- */ -static __isl_give isl_map *collect_deps(struct ppcg_scop *scop, - __isl_keep isl_schedule_node *node) -{ - isl_space *space; - isl_multi_union_pw_aff *prefix, *partial; - isl_union_map *flow, *other, *dep, *umap; - isl_map *map; - - prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); - partial = extract_input_schedule(node); - space = isl_multi_union_pw_aff_get_space(partial); - - flow = isl_union_map_copy(scop->dep_flow); - flow = isl_union_map_eq_at_multi_union_pw_aff(flow, - isl_multi_union_pw_aff_copy(prefix)); - if (!scop->options->live_range_reordering) { - other = isl_union_map_copy(scop->dep_false); - other = isl_union_map_eq_at_multi_union_pw_aff(other, prefix); - } else { - isl_union_map *local, *non_local, *order, *adj; - isl_union_set *domain, *range; - - other = isl_union_map_copy(scop->dep_forced); - other = isl_union_map_eq_at_multi_union_pw_aff(other, - isl_multi_union_pw_aff_copy(prefix)); - local = isl_union_map_copy(flow); - local = isl_union_map_eq_at_multi_union_pw_aff(local, - isl_multi_union_pw_aff_copy(partial)); - non_local = isl_union_map_copy(flow); - non_local = isl_union_map_subtract(non_local, local); - - order = isl_union_map_copy(scop->dep_order); - order = isl_union_map_eq_at_multi_union_pw_aff(order, prefix); - adj = isl_union_map_copy(order); - domain = isl_union_map_domain(isl_union_map_copy(non_local)); - domain = isl_union_set_coalesce(domain); - adj = isl_union_map_intersect_range(adj, domain); - other = isl_union_map_union(other, adj); - - adj = order; - range = isl_union_map_range(non_local); - range = isl_union_set_coalesce(range); - adj = isl_union_map_intersect_domain(adj, range); - other = isl_union_map_union(other, adj); - } - dep = isl_union_map_union(flow, other); - - umap = isl_union_map_from_multi_union_pw_aff(partial); - dep = isl_union_map_apply_domain(dep, isl_union_map_copy(umap)); - dep = isl_union_map_apply_range(dep, umap); - - space = isl_space_map_from_set(space); - map = 
isl_union_map_extract_map(dep, space); - isl_union_map_free(dep); - - map = isl_map_coalesce(map); - - return map; -} - -/* Given a constraint of the form - * - * a i_0 + b i_1 >= 0 - * or - * a i_0 + b i_1 = 0 - * - * use it to update one or both of the non-negative bounds - * in "list" = (min, max) such that - * - * i_1 >= -min i_0 - * and - * i_1 <= max i_0 - * - * If b = 0, then the constraint cannot be used. - * Otherwise, the constraint is equivalent to - * - * sgn(b) i_1 >= - a/abs(b) i_0 - * i.e., - * i_1 >= - a/abs(b) i_0 - * or - * i_1 <= a/abs(b) i_0 - * - * Set the first or second element of "list" to max(0, a/abs(b)), - * according to the sign of "b". Or set both in case the constraint - * is an equality, taking into account the sign change. - */ -static __isl_give isl_val_list *list_set_min_max(__isl_take isl_val_list *list, - __isl_keep isl_constraint *c) -{ - isl_val *a, *b; - int sign; - int pos; - isl_bool eq, is_zero, is_neg; - - eq = isl_constraint_is_equality(c); - if (eq < 0) - return isl_val_list_free(list); - - b = isl_constraint_get_coefficient_val(c, isl_dim_set, 1); - is_zero = isl_val_is_zero(b); - if (is_zero == isl_bool_true) { - isl_val_free(b); - return list; - } - a = isl_constraint_get_coefficient_val(c, isl_dim_set, 0); - sign = isl_val_sgn(b); - b = isl_val_abs(b); - a = isl_val_div(a, b); - - if (eq) - b = isl_val_copy(a); - - pos = sign > 0 ? 0 : 1; - is_neg = isl_val_is_neg(a); - if (is_neg == isl_bool_true) - a = isl_val_set_si(a, 0); - list = isl_val_list_set_val(list, pos, a); - - if (!eq) - return is_neg < 0 ? isl_val_list_free(list) : list; - - pos = 1 - pos; - a = isl_val_neg(b); - is_neg = isl_val_is_neg(a); - if (is_neg == isl_bool_true) - a = isl_val_set_si(a, 0); - list = isl_val_list_set_val(list, pos, a); - - return is_neg < 0 ? 
isl_val_list_free(list) : list; -} - -/* If constraint "c" passes through the origin, then try and use it - * to update the non-negative bounds in "list" = (min, max) such that - * - * i_1 >= -min i_0 - * and - * i_1 <= max i_0 - */ -static isl_stat set_min_max(__isl_take isl_constraint *c, void *user) -{ - isl_val *v; - isl_val_list **list = user; - isl_bool is_zero; - - v = isl_constraint_get_constant_val(c); - is_zero = isl_val_is_zero(v); - isl_val_free(v); - - if (is_zero == isl_bool_true) - *list = list_set_min_max(*list, c); - - isl_constraint_free(c); - return is_zero < 0 ? isl_stat_error : isl_stat_ok; -} - -/* Given a set of dependence distance vectors "dist", compute - * pair of non-negative bounds min and max such that - * - * d_pos >= -min d_0 - * and - * d_pos <= max d_0 - * - * and return the pair (min, max). - * If no bound can be found in either direction, then the bound - * is replaced by NaN. - * - * The dependence distances are first projected onto the (d_0, d_pos). - * Then the zero dependence distance is added and the convex hull is computed. - * Finally, the bounds are extracted from the constraints of the convex hull - * that pass through the origin. 
- */ -static __isl_give isl_val_list *min_max_dist(__isl_keep isl_set *dist, int pos) -{ - isl_space *space; - isl_basic_set *hull; - int dim; - isl_ctx *ctx; - isl_val *nan; - isl_val_list *list; - - ctx = isl_set_get_ctx(dist); - nan = isl_val_nan(ctx); - list = isl_val_list_alloc(ctx, 2); - list = isl_val_list_add(list, isl_val_copy(nan)); - list = isl_val_list_add(list, nan); - - dist = isl_set_copy(dist); - dim = isl_set_dim(dist, isl_dim_set); - if (dist && pos >= dim) - isl_die(ctx, isl_error_internal, "position out of bounds", - dist = isl_set_free(dist)); - dist = isl_set_project_out(dist, isl_dim_set, pos + 1, dim - (pos + 1)); - dist = isl_set_project_out(dist, isl_dim_set, 1, pos - 1); - - space = isl_set_get_space(dist); - dist = isl_set_union(dist, isl_set_from_point(isl_point_zero(space))); - dist = isl_set_remove_divs(dist); - hull = isl_set_convex_hull(dist); - - if (isl_basic_set_foreach_constraint(hull, &set_min_max, &list) < 0) - list = isl_val_list_free(list); - isl_basic_set_free(hull); - - return list; -} - -/* Given a schedule node "node" that, together with its child, - * satisfies the input pattern for hybrid tiling, compute bounds - * on the relative dependence distances of the child node with - * respect to the parent node. These bounds are needed to - * construct a hybrid tiling. - * - * First all relevant dependences are collected and mapped - * to the schedule space of the pair of nodes. Then, the - * dependence distances are computed in this space. - * - * These dependence distances are then projected onto a two-dimensional - * space consisting of the single schedule dimension of the outer node - * and one of the schedule dimensions of the inner node. - * The maximal and minimal relative dependence distances are extracted - * from these projections. - * This process is repeated for each of the schedule dimensions - * of the inner node. 
For the first dimension, both minimal and - * maximal relative dependence distances are stored in the result. - * For the other dimensions, only the minimal relative dependence - * distance is stored. - */ -__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop, - __isl_keep isl_schedule_node *node) -{ - ppcg_ht_bounds *bnd; - isl_space *space; - isl_map *map; - isl_set *dist; - isl_val_list *pair; - isl_schedule_node *child; - int n; - int i, dim; - - if (!scop || !node || check_input_pattern(node) < 0) - return NULL; - - child = isl_schedule_node_get_child(node, 0); - space = isl_schedule_node_band_get_space(child); - dim = isl_schedule_node_band_n_member(child); - isl_schedule_node_free(child); - bnd = ppcg_ht_bounds_alloc(space); - if (!bnd) - return NULL; - - map = collect_deps(scop, node); - - dist = isl_map_deltas(map); - n = isl_set_dim(dist, isl_dim_param); - dist = isl_set_project_out(dist, isl_dim_param, 0, n); - - pair = min_max_dist(dist, 1); - bnd = ppcg_ht_bounds_set_lower(bnd, 0, isl_val_list_get_val(pair, 0)); - bnd = ppcg_ht_bounds_set_upper(bnd, isl_val_list_get_val(pair, 1)); - isl_val_list_free(pair); - - for (i = 1; i < dim; ++i) { - pair = min_max_dist(dist, 1 + i); - bnd = ppcg_ht_bounds_set_lower(bnd, i, - isl_val_list_get_val(pair, 0)); - isl_val_list_free(pair); - } - - isl_set_free(dist); - - return bnd; -} - -/* Check if all the fields of "phase" are valid, freeing "phase" - * if they are not. - */ -static __isl_give ppcg_ht_phase *check_phase(__isl_take ppcg_ht_phase *phase) -{ - if (!phase) - return NULL; - - if (!phase->tiling || !phase->local_time || - !phase->shift_space || !phase->domain) - return ppcg_ht_phase_free(phase); - - return phase; -} - -/* Construct a ppcg_ht_phase object, that simply copies - * information from "tiling". - * That is, the result is defined over the "ts" space and - * corresponds to phase 1. 
- */ -static __isl_give ppcg_ht_phase *construct_phase( - __isl_keep ppcg_ht_tiling *tiling) -{ - isl_ctx *ctx; - ppcg_ht_phase *phase; - - if (!tiling) - return NULL; - - ctx = ppcg_ht_tiling_get_ctx(tiling); - phase = isl_calloc_type(ctx, struct ppcg_ht_phase); - if (!phase) - return NULL; - phase->tiling = ppcg_ht_tiling_copy(tiling); - phase->time_tile = isl_aff_copy(tiling->time_tile); - phase->local_time = isl_aff_copy(tiling->local_time); - phase->shift_space = isl_aff_copy(tiling->shift_space); - phase->domain = isl_set_copy(tiling->hex); - - return check_phase(phase); -} - -/* Align the parameters of the elements of "phase" to those of "space". - */ -static __isl_give ppcg_ht_phase *phase_align_params( - __isl_take ppcg_ht_phase *phase, __isl_take isl_space *space) -{ - if (!phase) - goto error; - - phase->time_tile = isl_aff_align_params(phase->time_tile, - isl_space_copy(space)); - phase->local_time = isl_aff_align_params(phase->local_time, - isl_space_copy(space)); - phase->shift_space = isl_aff_align_params(phase->shift_space, - isl_space_copy(space)); - phase->domain = isl_set_align_params(phase->domain, space); - - return check_phase(phase); -error: - isl_space_free(space); - return NULL; -} - -/* Pull back "phase" over "ma". - * That is, take a phase defined over the range of "ma" and - * turn it into a phase defined over the domain of "ma". 
- */ -static __isl_give ppcg_ht_phase *pullback_phase(__isl_take ppcg_ht_phase *phase, - __isl_take isl_multi_aff *ma) -{ - phase = phase_align_params(phase, isl_multi_aff_get_space(ma)); - if (!phase) - goto error; - - phase->time_tile = isl_aff_pullback_multi_aff(phase->time_tile, - isl_multi_aff_copy(ma)); - phase->local_time = isl_aff_pullback_multi_aff(phase->local_time, - isl_multi_aff_copy(ma)); - phase->shift_space = isl_aff_pullback_multi_aff(phase->shift_space, - isl_multi_aff_copy(ma)); - phase->domain = isl_set_preimage_multi_aff(phase->domain, ma); - - return check_phase(phase); -error: - isl_multi_aff_free(ma); - return NULL; -} - -/* Pullback "phase" over phase->tiling->shift_phase, which shifts - * phase 0 to phase 1. The pullback therefore takes a phase 1 - * description and turns it into a phase 0 description. - */ -static __isl_give ppcg_ht_phase *shift_phase(__isl_take ppcg_ht_phase *phase) -{ - ppcg_ht_tiling *tiling; - - if (!phase) - return NULL; - - tiling = phase->tiling; - return pullback_phase(phase, isl_multi_aff_copy(tiling->shift_phase)); -} - -/* Take a "phase" defined over the ts-space and plug in the projection - * from the input schedule space to the ts-space. - * The result is then defined over this input schedule space. - */ -static __isl_give ppcg_ht_phase *lift_phase(__isl_take ppcg_ht_phase *phase) -{ - ppcg_ht_tiling *tiling; - - if (!phase) - return NULL; - - tiling = phase->tiling; - return pullback_phase(phase, isl_multi_aff_copy(tiling->project_ts)); -} - -/* Compute the shift that should be added to the space band - * in order to be able to apply rectangular tiling to the space. - * Store the shift in phase->space_shift. - * - * In the first dimension, it is equal to shift_space - s. 
- * For phase 1, this results in - * - * (-(2 * shift_s)*T) % W - * - * In phase 0, the "s" in shift_space has been replaced by "s + shift_s", - * so the result is - * - * shift_s + (-(2 * shift_s)*T) % W - * - * In the other dimensions, the shift is equal to - * - * dl_i * local_time. - */ -static __isl_give ppcg_ht_phase *compute_space_shift( - __isl_take ppcg_ht_phase *phase) -{ - int i, n; - isl_space *space; - isl_local_space *ls; - isl_aff *aff, *s; - isl_multi_aff *space_shift; - - if (!phase) - return NULL; - - space = ppcg_ht_phase_get_input_space(phase); - space = isl_space_unwrap(space); - space = isl_space_range_map(space); - - space_shift = isl_multi_aff_zero(space); - aff = isl_aff_copy(phase->shift_space); - ls = isl_local_space_from_space(isl_aff_get_domain_space(aff)); - s = isl_aff_var_on_domain(ls, isl_dim_set, 1); - aff = isl_aff_sub(aff, s); - space_shift = isl_multi_aff_set_aff(space_shift, 0, aff); - - n = isl_multi_aff_dim(space_shift, isl_dim_out); - for (i = 1; i < n; ++i) { - isl_val *v; - isl_aff *time; - - v = ppcg_ht_bounds_get_lower(phase->tiling->bounds, i); - time = isl_aff_copy(phase->local_time); - time = isl_aff_scale_val(time, v); - space_shift = isl_multi_aff_set_aff(space_shift, i, time); - } - - if (!space_shift) - return ppcg_ht_phase_free(phase); - phase->space_shift = space_shift; - return phase; -} - -/* Compute the space tiling and store the result in phase->space_tile. 
- * The space tiling is of the form - * - * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size] - */ -static __isl_give ppcg_ht_phase *compute_space_tile( - __isl_take ppcg_ht_phase *phase) -{ - isl_space *space; - isl_multi_val *space_sizes; - isl_multi_aff *space_shift; - isl_multi_aff *tile; - - if (!phase) - return NULL; - - space = ppcg_ht_phase_get_input_space(phase); - space = isl_space_unwrap(space); - tile = isl_multi_aff_range_map(space); - space_shift = isl_multi_aff_copy(phase->space_shift); - tile = isl_multi_aff_add(space_shift, tile); - space_sizes = isl_multi_val_copy(phase->tiling->space_sizes); - tile = isl_multi_aff_scale_down_multi_val(tile, space_sizes); - tile = isl_multi_aff_floor(tile); - - if (!tile) - return ppcg_ht_phase_free(phase); - phase->space_tile = tile; - return phase; -} - -/* Construct a representation for one of the two phase for hybrid tiling - * "tiling". If "shift" is not set, then the phase is constructed - * directly from the hexagonal tile shape in "tiling", which represents - * the phase-1 tiles. If "shift" is set, then this tile shape is shifted - * back over tiling->shift_phase to obtain the phase-0 tiles. - * - * First copy data from "tiling", then optionally shift the phase and - * finally move the tiling from the "ts" space of "tiling" to - * the space of the input pattern. - * - * After the basic phase has been computed, also compute - * the corresponding space shift. - */ -static __isl_give ppcg_ht_phase *ppcg_ht_tiling_compute_phase( - __isl_keep ppcg_ht_tiling *tiling, int shift) -{ - ppcg_ht_phase *phase; - - phase = construct_phase(tiling); - if (shift) - phase = shift_phase(phase); - phase = lift_phase(phase); - - phase = compute_space_shift(phase); - phase = compute_space_tile(phase); - - return phase; -} - -/* Consruct a function that is equal to the time tile of "phase0" - * on the domain of "phase0" and equal to the time tile of "phase1" - * on the domain of "phase1". 
- * The two domains are assumed to form a partition of the input - * schedule space. - */ -static __isl_give isl_pw_multi_aff *combine_time_tile( - __isl_keep ppcg_ht_phase *phase0, __isl_keep ppcg_ht_phase *phase1) -{ - isl_aff *T; - isl_pw_aff *time, *time1; - - if (!phase0 || !phase1) - return NULL; - - T = isl_aff_copy(phase0->time_tile); - time = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase0), T); - - T = isl_aff_copy(phase1->time_tile); - time1 = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase1), T); - - time = isl_pw_aff_union_add(time, time1); - - return isl_pw_multi_aff_from_pw_aff(time); -} - -/* Name used in mark nodes that contain a pointer to a ppcg_ht_phase. - */ -static char *ppcg_phase_name = "phase"; - -/* Does "id" contain a pointer to a ppcg_ht_phase? - * That is, is it called "phase"? - */ -static isl_bool is_phase_id(__isl_keep isl_id *id) -{ - const char *name; - - name = isl_id_get_name(id); - if (!name) - return isl_bool_error; - - return !strcmp(name, ppcg_phase_name); -} - -/* Given a mark node with an identifier that points to a ppcg_ht_phase, - * extract this ppcg_ht_phase pointer. - */ -__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark( - __isl_keep isl_schedule_node *node) -{ - isl_bool is_phase; - isl_id *id; - void *p; - - if (!node) - return NULL; - if (isl_schedule_node_get_type(node) != isl_schedule_node_mark) - isl_die(isl_schedule_node_get_ctx(node), isl_error_internal, - "not a phase mark", return NULL); - - id = isl_schedule_node_mark_get_id(node); - is_phase = is_phase_id(id); - p = isl_id_get_user(id); - isl_id_free(id); - - if (is_phase < 0) - return NULL; - if (!is_phase) - isl_die(isl_schedule_node_get_ctx(node), isl_error_internal, - "not a phase mark", return NULL); - - return p; -} - -/* Insert a mark node at "node" holding a pointer to "phase". 
- */ -static __isl_give isl_schedule_node *insert_phase( - __isl_take isl_schedule_node *node, __isl_take ppcg_ht_phase *phase) -{ - isl_ctx *ctx; - isl_id *id; - - if (!node) - goto error; - ctx = isl_schedule_node_get_ctx(node); - id = isl_id_alloc(ctx, ppcg_phase_name, phase); - if (!id) - goto error; - id = isl_id_set_free_user(id, &ppcg_ht_phase_free_wrap); - node = isl_schedule_node_insert_mark(node, id); - - return node; -error: - ppcg_ht_phase_free(phase); - isl_schedule_node_free(node); - return NULL; -} - -/* Construct a mapping from the elements of the original pair of bands - * to which tiling was applied that belong to a tile of "phase" - * to that tile, preserving the values for the outer bands. - * - * The mapping is of the form - * - * [[outer] -> [P -> C]] -> [[outer] -> [tile]] - * - * where tile is defined by a concatenation of the time_tile and - * the space_tile. - */ -static __isl_give isl_map *construct_tile_map(__isl_keep ppcg_ht_phase *phase) -{ - int depth; - isl_space *space; - isl_multi_aff *ma; - isl_multi_aff *tiling; - isl_map *el2tile; - - depth = isl_schedule_node_get_schedule_depth( - phase->tiling->input_node); - space = isl_aff_get_space(phase->time_tile); - space = isl_space_params(space); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, depth); - space = isl_space_map_from_set(space); - ma = isl_multi_aff_identity(space); - - tiling = isl_multi_aff_flat_range_product( - isl_multi_aff_from_aff(isl_aff_copy(phase->time_tile)), - isl_multi_aff_copy(phase->space_tile)); - el2tile = isl_map_from_multi_aff(tiling); - el2tile = isl_map_intersect_domain(el2tile, - isl_set_copy(phase->domain)); - el2tile = isl_map_product(isl_map_from_multi_aff(ma), el2tile); - - return el2tile; -} - -/* Return a description of the full tiles of "phase" at the point - * in the original schedule tree where the tiling was applied. 
- * - * First construct a mapping from the input schedule dimensions - * up to an including the original pair of bands to which hybrid tiling - * was applied to schedule dimensions in which this original pair - * has been replaced by the tiles. - * This mapping is of the form - * - * [[outer] -> [P -> C]] -> [[outer] -> [tile]] - * - * Apply this mapping to the set of all values for the input - * schedule dimensions and then apply its inverse. - * The result is the set of values for the input schedule dimensions - * that would map to any of the tiles. Subtracting from this set - * the set of values that are actually executed produces the set - * of values that belong to a tile but that are not executed. - * Mapping these back to the tiles produces a description of - * the partial tiles. Subtracting these from the set of all tiles - * produces a description of the full tiles in the form - * - * [[outer] -> [tile]] - */ -static __isl_give isl_set *compute_full_tile(__isl_keep ppcg_ht_phase *phase) -{ - isl_schedule_node *node; - isl_union_set *domain; - isl_union_map *prefix, *schedule; - isl_set *all, *partial, *all_el; - isl_map *tile2el, *el2tile; - isl_multi_union_pw_aff *mupa; - - el2tile = construct_tile_map(phase); - tile2el = isl_map_reverse(isl_map_copy(el2tile)); - - node = phase->tiling->input_node; - prefix = isl_schedule_node_get_prefix_schedule_union_map(node); - domain = isl_schedule_node_get_domain(node); - mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); - schedule = isl_union_map_from_multi_union_pw_aff(mupa); - schedule = isl_union_map_range_product(prefix, schedule); - all_el = isl_set_from_union_set(isl_union_set_apply(domain, schedule)); - all_el = isl_set_coalesce(all_el); - - all = isl_set_apply(isl_set_copy(all_el), isl_map_copy(el2tile)); - - partial = isl_set_copy(all); - partial = isl_set_apply(partial, tile2el); - partial = isl_set_subtract(partial, all_el); - partial = isl_set_apply(partial, el2tile); - - return 
isl_set_subtract(all, partial); -} - -/* Copy the AST loop types of the non-isolated part to those - * of the isolated part. - */ -static __isl_give isl_schedule_node *set_isolate_loop_type( - __isl_take isl_schedule_node *node) -{ - int i, n; - - n = isl_schedule_node_band_n_member(node); - for (i = 0; i < n; ++i) { - enum isl_ast_loop_type type; - - type = isl_schedule_node_band_member_get_ast_loop_type(node, i); - node = isl_schedule_node_band_member_set_isolate_ast_loop_type( - node, i, type); - } - - return node; -} - -/* If options->isolate_full_tiles is set, then mark the full tiles - * in "node" for isolation. The full tiles are derived from "phase". - * "node" may point to a part of the tiling, e.g., the space tiling. - * - * The full tiles are originally computed in the form - * - * [[outer] -> [tile]] - * - * However, the band that "node" points to may only contain - * subset of the tile dimensions. - * The description above is therefore treated as - * - * [[outer] -> [before; this; after]] - * - * before is of size "pos"; this is of size "dim"; and - * after is of size "out - pos - dim". - * The after part is first project out. Then the range is split - * into a before and this part and finally the before part is moved - * to the domain, resulting in - * - * [[outer; before] -> [this]] - * - * This description is then used as the isolate option. - * - * The AST loop type for the isolated part is set to be the same - * as that of the non-isolated part. 
- */ -static __isl_give isl_schedule_node *ppcg_ht_phase_isolate_full_tile_node( - __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node, - struct ppcg_options *options) -{ - int in, out, pos, depth, dim; - isl_space *space; - isl_multi_aff *ma1, *ma2; - isl_set *tile; - isl_map *map; - isl_set *set; - isl_union_set *opt; - - if (!options->isolate_full_tiles) - return node; - - depth = isl_schedule_node_get_schedule_depth(node); - dim = isl_schedule_node_band_n_member(node); - - tile = compute_full_tile(phase); - map = isl_set_unwrap(tile); - in = isl_map_dim(map, isl_dim_in); - out = isl_map_dim(map, isl_dim_out); - pos = depth - in; - map = isl_map_project_out(map, isl_dim_out, pos + dim, - out - (pos + dim)); - space = isl_space_range(isl_map_get_space(map)); - ma1 = isl_multi_aff_project_out_map(isl_space_copy(space), - isl_dim_set, pos, dim); - ma2 = isl_multi_aff_project_out_map(space, isl_dim_set, 0, pos); - ma1 = isl_multi_aff_range_product(ma1, ma2); - map = isl_map_apply_range(map, isl_map_from_multi_aff(ma1)); - map = isl_map_uncurry(map); - map = isl_map_flatten_domain(map); - set = isl_map_wrap(map); - set = isl_set_set_tuple_name(set, "isolate"); - - opt = isl_schedule_node_band_get_ast_build_options(node); - opt = isl_union_set_add_set(opt, set); - node = isl_schedule_node_band_set_ast_build_options(node, opt); - node = set_isolate_loop_type(node); - - return node; -} - -/* Insert a band node for performing the space tiling for "phase" at "node". - * In particular, insert a band node with partial schedule - * - * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)] - * - * pulled back over the input schedule. - * "options" determines whether full tiles should be separated - * from partial tiles. - * - * The first tile dimension iterates over the hexagons in the same - * phase, which are independent by construction. The first dimension - * is therefore marked coincident. 
- * All dimensions are also marked for being generated as atomic loops - * because separation is usually not desirable on tile loops. - */ -static __isl_give isl_schedule_node *insert_space_tiling( - __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node, - struct ppcg_options *options) -{ - isl_multi_aff *space_tile; - isl_multi_union_pw_aff *mupa; - - if (!phase) - return isl_schedule_node_free(node); - - space_tile = isl_multi_aff_copy(phase->space_tile); - mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); - mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_tile); - node = isl_schedule_node_insert_partial_schedule(node, mupa); - node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); - node = ppcg_ht_phase_isolate_full_tile_node(phase, node, options); - node = isl_schedule_node_band_member_set_coincident(node, 0, 1); - - return node; -} - -/* Given a pointer "node" to (a copy of) the original child node - * in the input pattern, adjust its partial schedule such that - * it starts at zero within each tile. - * - * That is, replace "s" by (s + space_shift) % space_sizes. - */ -__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point( - __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node) -{ - isl_multi_val *space_sizes; - isl_multi_aff *space_shift; - isl_multi_union_pw_aff *mupa; - - space_shift = isl_multi_aff_copy(phase->space_shift); - mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); - mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_shift); - node = isl_schedule_node_band_shift(node, mupa); - space_sizes = isl_multi_val_copy(phase->tiling->space_sizes); - node = isl_schedule_node_band_mod(node, space_sizes); - - return node; -} - -/* Does - * - * s0 > delta + 2 * {delta * h} - 1 - * - * hold? 
- */ -static isl_bool wide_enough(__isl_keep isl_val *s0, __isl_keep isl_val *delta, - __isl_keep isl_val *h) -{ - isl_val *v, *v2; - isl_bool ok; - - v = isl_val_mul(isl_val_copy(delta), isl_val_copy(h)); - v2 = isl_val_floor(isl_val_copy(v)); - v = isl_val_sub(v, v2); - v = isl_val_mul_ui(v, 2); - v = isl_val_add(v, isl_val_copy(delta)); - v = isl_val_sub_ui(v, 1); - ok = isl_val_gt(s0, v); - isl_val_free(v); - - return ok; -} - -/* Is the tile size specified by "sizes" wide enough in the first space - * dimension, i.e., the base of the hexagon? This ensures that, - * after hybrid tiling using "bounds" and these sizes, - * neighboring hexagons in the same phase are far enough apart - * that they do not depend on each other. - * The test is only meaningful if the bounds are valid. - * - * Let st be (half) the size in the time dimension and s0 the base - * size in the first space dimension. Let delta be the dependence - * distance in either positive or negative direction. In principle, - * it should be enough to have s0 + 1 > delta, i.e., s0 >= delta. - * However, in case of fractional delta, the tile is not extended - * with delta * (st - 1), but instead with floor(delta * (st - 1)). - * The condition therefore needs to be adjusted to - * - * s0 + 1 > delta + 2 {delta * (st - 1)} - * - * (with {} the fractional part) to account for the two slanted sides. - * The condition in the paper "Hybrid Hexagonal/Classical Tiling for GPUs" - * translates to - * - * s0 >= delta + {delta * (st - 1)} - * - * Since 1 > frac(delta * (st - 1)), this condition implies - * the condition above. - * - * The condition is checked for both directions. 
- */ -isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds, - __isl_keep isl_multi_val *sizes) -{ - isl_val *s0, *h; - isl_val *delta; - isl_bool ok; - - ok = ppcg_ht_bounds_is_valid(bounds); - if (ok < 0 || !ok) - return ok; - - h = isl_val_sub_ui(isl_multi_val_get_val(sizes, 0), 1); - s0 = isl_multi_val_get_val(sizes, 1); - - delta = ppcg_ht_bounds_get_lower(bounds, 0); - ok = wide_enough(s0, delta, h); - isl_val_free(delta); - - delta = ppcg_ht_bounds_get_upper(bounds); - if (ok == isl_bool_true) - ok = wide_enough(s0, delta, h); - isl_val_free(delta); - - isl_val_free(s0); - isl_val_free(h); - - return ok; -} - -/* Check that the tile will be wide enough in the first space - * dimension, i.e., the base of the hexagon. This ensures that - * neighboring hexagons in the same phase are far enough apart - * that they do not depend on each other. - * - * Error out if the condition fails to hold. - */ -static isl_stat check_width(__isl_keep ppcg_ht_bounds *bounds, - __isl_keep isl_multi_val *sizes) -{ - isl_bool ok; - - ok = ppcg_ht_bounds_supports_sizes(bounds, sizes); - - if (ok < 0) - return isl_stat_error; - if (!ok) - isl_die(isl_multi_val_get_ctx(sizes), isl_error_invalid, - "base of hybrid tiling hexagon not sufficiently wide", - return isl_stat_error); - - return isl_stat_ok; -} - -/* Given valid bounds on the relative dependence distances for - * the pair of nested nodes that "node" point to, as well as sufficiently - * wide tile sizes "sizes", insert the corresponding time and space tiling - * at "node", along with a pair of phase nodes that can be used - * to make further changes. - * The space of "sizes" should be the product of the spaces - * of the schedules of the pair of parent and child nodes. - * "options" determines whether full tiles should be separated - * from partial tiles. - * - * In particular, given an input of the form - * - * P - C - ... - * - * the output has the form - * - * /- F0 - M0 - CT0 - P - C - ... 
- * PT - seq - * \- F1 - M1 - CT1 - P - C - ... - * - * PT is the global time tiling. Within each of these tiles, - * two phases are executed in order. Within each phase, the schedule - * space is further subdivided into tiles through CT0 and CT1. - * The first dimension of each of these iterates over the hexagons - * within a phase and these are independent by construction. - * The F0 and F1 filters filter the statement instances that belong - * to the corresponding phase. The M0 and M1 marks contain a pointer - * to a ppcg_ht_phase object that can be used to perform further changes. - * - * After checking that input satisfies the requirements, - * a data structure is constructed that represents the tiling and - * two additional data structures are constructed for the two phases - * of the tiling. These are then used to define the filters F0 and F1 and - * combined to construct the time tiling PT. - * Then the time tiling node PT is inserted, followed by - * the sequence with the two filters, the CT space tiling nodes and - * the phase markers M. 
- */ -__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling( - __isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes, - __isl_take isl_schedule_node *node, struct ppcg_options *options) -{ - isl_ctx *ctx; - isl_union_set *phase0; - isl_union_set *phase1; - isl_multi_union_pw_aff *input, *dom_time; - isl_union_pw_multi_aff *upma; - isl_pw_multi_aff *time; - isl_union_set_list *phases; - ppcg_ht_tiling *tiling; - ppcg_ht_phase *phase_0; - ppcg_ht_phase *phase_1; - - if (!node || !sizes || !bounds) - goto error; - if (check_input_pattern(node) < 0 || check_width(bounds, sizes) < 0) - goto error; - - ctx = isl_schedule_node_get_ctx(node); - - input = extract_input_schedule(node); - - tiling = ppcg_ht_bounds_construct_tiling(bounds, node, input, sizes); - phase_0 = ppcg_ht_tiling_compute_phase(tiling, 1); - phase_1 = ppcg_ht_tiling_compute_phase(tiling, 0); - time = combine_time_tile(phase_0, phase_1); - ppcg_ht_tiling_free(tiling); - - upma = isl_union_pw_multi_aff_from_multi_union_pw_aff( - isl_multi_union_pw_aff_copy(input)); - phase0 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_0)); - phase0 = isl_union_set_preimage_union_pw_multi_aff(phase0, - isl_union_pw_multi_aff_copy(upma)); - phase1 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_1)); - phase1 = isl_union_set_preimage_union_pw_multi_aff(phase1, upma); - - phases = isl_union_set_list_alloc(ctx, 2); - phases = isl_union_set_list_add(phases, phase0); - phases = isl_union_set_list_add(phases, phase1); - - dom_time = isl_multi_union_pw_aff_apply_pw_multi_aff(input, time); - node = isl_schedule_node_insert_partial_schedule(node, dom_time); - - node = isl_schedule_node_child(node, 0); - - node = isl_schedule_node_insert_sequence(node, phases); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_child(node, 0); - node = insert_space_tiling(phase_0, node, options); - node = insert_phase(node, phase_0); - node = isl_schedule_node_parent(node); - node = 
isl_schedule_node_next_sibling(node); - node = isl_schedule_node_child(node, 0); - node = insert_space_tiling(phase_1, node, options); - node = insert_phase(node, phase_1); - node = isl_schedule_node_parent(node); - node = isl_schedule_node_parent(node); - - node = isl_schedule_node_parent(node); - - isl_multi_val_free(sizes); - return node; -error: - isl_multi_val_free(sizes); - isl_schedule_node_free(node); - ppcg_ht_bounds_free(bounds); - return NULL; -} - -/* Given a branch "node" that contains a sequence node with two phases - * of hybrid tiling as input, call "fn" on each of the two phase marker - * nodes. - * - * That is, the input is as follows - * - * /- F0 - M0 - ... - * ... - seq - * \- F1 - M1 - ... - * - * and "fn" is called on M0 and on M1. - */ -__isl_give isl_schedule_node *hybrid_tile_foreach_phase( - __isl_take isl_schedule_node *node, - __isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node, - void *user), void *user) -{ - int depth0, depth; - - depth0 = isl_schedule_node_get_tree_depth(node); - - while (node && - isl_schedule_node_get_type(node) != isl_schedule_node_sequence) - node = isl_schedule_node_child(node, 0); - - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_child(node, 0); - if (!node) - return NULL; - node = fn(node, user); - node = isl_schedule_node_parent(node); - node = isl_schedule_node_next_sibling(node); - node = isl_schedule_node_child(node, 0); - if (!node) - return NULL; - node = fn(node, user); - node = isl_schedule_node_parent(node); - node = isl_schedule_node_parent(node); - - depth = isl_schedule_node_get_tree_depth(node); - node = isl_schedule_node_ancestor(node, depth - depth0); - - return node; -} - -/* This function is called on each of the two phase marks - * in a hybrid tiling tree. - * Drop the phase mark at "node". 
- */ -static __isl_give isl_schedule_node *drop_phase_mark( - __isl_take isl_schedule_node *node, void *user) -{ - isl_id *id; - isl_bool is_phase; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_mark) - return node; - - id = isl_schedule_node_mark_get_id(node); - is_phase = is_phase_id(id); - isl_id_free(id); - - if (is_phase < 0) - return isl_schedule_node_free(node); - if (is_phase) - node = isl_schedule_node_delete(node); - - return node; -} - -/* Given a branch "node" that contains a sequence node with two phases - * of hybrid tiling as input, remove the two phase marker nodes. - * - * That is, the input is as follows - * - * /- F0 - M0 - ... - * ... - seq - * \- F1 - M1 - ... - * - * and the output is - * - * /- F0 - ... - * ... - seq - * \- F1 - ... - */ -__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks( - __isl_take isl_schedule_node *node) -{ - return hybrid_tile_foreach_phase(node, &drop_phase_mark, NULL); -} diff --git a/polly/lib/External/ppcg/ocl_utilities.h b/polly/lib/External/ppcg/ocl_utilities.h deleted file mode 100644 --- a/polly/lib/External/ppcg/ocl_utilities.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef OCL_UTILITIES_H -#define OCL_UTILITIES_H - -#if defined(__APPLE__) -#include -#else -#include -#endif - -/* Return the OpenCL error string for a given error number. - */ -const char *opencl_error_string(cl_int error); - -/* Find a GPU or a CPU associated with the first available platform. - * If use_gpu is set, then this function first tries to look for a GPU - * in the first available platform. - * If this fails or if use_gpu is not set, then it tries to use the CPU. - */ -cl_device_id opencl_create_device(int use_gpu); - -/* Create an OpenCL program from a string and compile it. - */ -cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev, - const char *program_source, size_t program_size, - const char *opencl_options); - -/* Create an OpenCL program from a source file and compile it. 
- */ -cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev, - const char* filename, const char* opencl_options); - -#endif diff --git a/polly/lib/External/ppcg/ocl_utilities.c b/polly/lib/External/ppcg/ocl_utilities.c deleted file mode 100644 --- a/polly/lib/External/ppcg/ocl_utilities.c +++ /dev/null @@ -1,174 +0,0 @@ -#include -#include -#include "ocl_utilities.h" - -/* Return the OpenCL error string for a given error number. - */ -const char *opencl_error_string(cl_int error) -{ - int errorCount; - int index; - - static const char *errorString[] = { - [CL_SUCCESS] = "CL_SUCCESS", - [-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND", - [-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE", - [-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE", - [-CL_MEM_OBJECT_ALLOCATION_FAILURE] = - "CL_MEM_OBJECT_ALLOCATION_FAILURE", - [-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES", - [-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY", - [-CL_PROFILING_INFO_NOT_AVAILABLE] = - "CL_PROFILING_INFO_NOT_AVAILABLE", - [-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP", - [-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH", - [-CL_IMAGE_FORMAT_NOT_SUPPORTED] = - "CL_IMAGE_FORMAT_NOT_SUPPORTED", - [-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE", - [-CL_MAP_FAILURE] = "CL_MAP_FAILURE", - [-CL_INVALID_VALUE] = "CL_INVALID_VALUE", - [-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE", - [-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM", - [-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE", - [-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT", - [-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES", - [-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE", - [-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR", - [-CL_INVALID_MEM_OBJECT] = "CL_INVALID_MEM_OBJECT", - [-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] = - "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", - [-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE", - [-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER", - 
[-CL_INVALID_BINARY] = "CL_INVALID_BINARY", - [-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS", - [-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM", - [-CL_INVALID_PROGRAM_EXECUTABLE] = - "CL_INVALID_PROGRAM_EXECUTABLE", - [-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME", - [-CL_INVALID_KERNEL_DEFINITION] = - "CL_INVALID_KERNEL_DEFINITION", - [-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL", - [-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX", - [-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE", - [-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE", - [-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS", - [-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION", - [-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE", - [-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE", - [-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET", - [-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST", - [-CL_INVALID_EVENT] = "CL_INVALID_EVENT", - [-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION", - [-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT", - [-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE", - [-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL", - [-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE", - [-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY" - }; - - errorCount = sizeof(errorString) / sizeof(errorString[0]); - index = -error; - - return (index >= 0 && index < errorCount) ? - errorString[index] : "Unspecified Error"; -} - -/* Find a GPU or a CPU associated with the first available platform. - * If use_gpu is set, then this function first tries to look for a GPU - * in the first available platform. - * If this fails or if use_gpu is not set, then it tries to use the CPU. 
- */ -cl_device_id opencl_create_device(int use_gpu) -{ - cl_platform_id platform; - cl_device_id dev; - int err; - - err = clGetPlatformIDs(1, &platform, NULL); - if (err < 0) { - fprintf(stderr, "Error %s while looking for a platform.\n", - opencl_error_string(err)); - exit(1); - } - - err = CL_DEVICE_NOT_FOUND; - if (use_gpu) - err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, - NULL); - if (err == CL_DEVICE_NOT_FOUND) - err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, - NULL); - if (err < 0) { - fprintf(stderr, "Error %s while looking for a device.\n", - opencl_error_string(err)); - exit(1); - } - return dev; -} - -/* Create an OpenCL program from a string and compile it. - */ -cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev, - const char *program_source, size_t program_size, - const char *opencl_options) -{ - int err; - cl_program program; - char *program_log; - size_t log_size; - - program = clCreateProgramWithSource(ctx, 1, - &program_source, &program_size, &err); - if (err < 0) { - fprintf(stderr, "Could not create the program\n"); - exit(1); - } - err = clBuildProgram(program, 0, NULL, opencl_options, NULL, NULL); - if (err < 0) { - fprintf(stderr, "Could not build the program.\n"); - clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, - NULL, &log_size); - program_log = (char *) malloc(log_size + 1); - program_log[log_size] = '\0'; - clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, - log_size + 1, program_log, NULL); - fprintf(stderr, "%s\n", program_log); - free(program_log); - exit(1); - } - return program; -} - -/* Create an OpenCL program from a source file and compile it. 
- */ -cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev, - const char* filename, const char* opencl_options) -{ - cl_program program; - FILE *program_file; - char *program_source; - size_t program_size, read; - - program_file = fopen(filename, "r"); - if (program_file == NULL) { - fprintf(stderr, "Could not find the source file.\n"); - exit(1); - } - fseek(program_file, 0, SEEK_END); - program_size = ftell(program_file); - rewind(program_file); - program_source = (char *) malloc(program_size + 1); - program_source[program_size] = '\0'; - read = fread(program_source, sizeof(char), program_size, program_file); - if (read != program_size) { - fprintf(stderr, "Error while reading the kernel.\n"); - exit(1); - } - fclose(program_file); - - program = opencl_build_program_from_string(ctx, dev, program_source, - program_size, opencl_options); - free(program_source); - - return program; -} diff --git a/polly/lib/External/ppcg/opencl.h b/polly/lib/External/ppcg/opencl.h deleted file mode 100644 --- a/polly/lib/External/ppcg/opencl.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _OPENCL_H -#define _OPENCL_H - -#include -#include "ppcg_options.h" -#include "ppcg.h" - -int generate_opencl(isl_ctx *ctx, struct ppcg_options *options, - const char *input, const char *output); - -#endif diff --git a/polly/lib/External/ppcg/opencl_test.sh.in b/polly/lib/External/ppcg/opencl_test.sh.in deleted file mode 100644 --- a/polly/lib/External/ppcg/opencl_test.sh.in +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/sh - -keep=no - -for option; do - case "$option" in - --keep) - keep=yes - ;; - esac -done - -EXEEXT=@EXEEXT@ -VERSION=@GIT_HEAD_VERSION@ -CC="@CC@" -CFLAGS="--std=gnu99" -srcdir="@srcdir@" - -if [ $keep = "yes" ]; then - OUTDIR="opencl_test.$VERSION" - mkdir "$OUTDIR" || exit 1 -else - if test "x$TMPDIR" = "x"; then - TMPDIR=/tmp - fi - OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1 -fi - -run_tests () { - subdir=$1 - ppcg_options=$2 - - echo Test with PPCG options 
\'$ppcg_options\' - mkdir ${OUTDIR}/${subdir} || exit 1 - for i in $srcdir/tests/*.c; do - echo $i - name=`basename $i` - name="${name%.c}" - out_c="${OUTDIR}/${subdir}/$name.ppcg.c" - out="${OUTDIR}/${subdir}/$name.ppcg$EXEEXT" - options="--target=opencl --opencl-no-use-gpu $ppcg_options" - functions="$srcdir/tests/${name}_opencl_functions.cl" - if test -f $functions; then - options="$options --opencl-include-file=$functions" - options="$options --opencl-compiler-options=-I." - fi - ./ppcg$EXEEXT $options $i -o "$out_c" || exit - $CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \ - -I. "$out_c" -o "$out" || exit - $out || exit - done -} - -run_tests default -run_tests embed --opencl-embed-kernel-code - -for i in $srcdir/examples/*.c; do - echo $i - name=`basename $i` - name="${name%.c}" - exe_ref="${OUTDIR}/$name.ref$EXEEXT" - gen_ocl="${OUTDIR}/$name.ppcg.c" - exe_ocl="${OUTDIR}/$name.ppcg$EXEEXT" - output_ref="${OUTDIR}/$name.ref.out" - output_ocl="${OUTDIR}/$name.ppcg.out" - $CC $CFLAGS $i -o $exe_ref || exit - ./ppcg$EXEEXT --target=opencl --opencl-no-use-gpu $i -o "$gen_ocl" || \ - exit - $CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \ - "$gen_ocl" -o "$exe_ocl" || exit - $exe_ref > $output_ref || exit - $exe_ocl > $output_ocl || exit - cmp $output_ref $output_ocl || exit -done - -if [ $keep = "no" ]; then - rm -r "${OUTDIR}" -fi diff --git a/polly/lib/External/ppcg/polybench_test.sh.in b/polly/lib/External/ppcg/polybench_test.sh.in deleted file mode 100644 --- a/polly/lib/External/ppcg/polybench_test.sh.in +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/sh - -keep=no -verbose=no - -for option; do - case "$option" in - --keep) - keep=yes - ;; - --verbose) - verbose=yes - ;; - esac -done - -EXEEXT=@EXEEXT@ -DIR=@POLYBENCH_DIR@ -VERSION=@GIT_HEAD_VERSION@ -SIZE=-DMINI_DATASET -CC="@CC@" -HAVE_OPENCL=@HAVE_OPENCL@ -HAVE_OPENMP=@HAVE_OPENMP@ -srcdir="@srcdir@" -if [ $keep = "yes" ]; then - OUTDIR="out.$VERSION" - mkdir "$OUTDIR" || exit 1 -else - 
if test "x$TMPDIR" = "x"; then - TMPDIR=/tmp - fi - OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1 -fi -CPPFLAGS="-DPOLYBENCH_USE_C99_PROTO -DPOLYBENCH_DUMP_ARRAYS" -CPPFLAGS="$CPPFLAGS $SIZE -I $DIR/utilities" -CFLAGS="-lm --std=gnu99" - -echo "Running tests in folder ${OUTDIR}" - -run_tests () { - ext=$1 - - ppcg_options=$2 - cc_options=$3 - - if [ "x$ppcg_options" = "x" ]; then - ppcg_option_str="none" - else - ppcg_option_str=$ppcg_options - fi - - if [ "x$cc_options" = "x" ]; then - cc_option_str="none" - else - cc_option_str=$cc_options - fi - - echo Test: $ext, ppcg options: $ppcg_option_str, CC options: $cc_option_str - for i in `cat $DIR/utilities/benchmark_list`; do - echo $i - name=`basename $i` - name=${name%.c} - source_opt="${OUTDIR}/$name.$ext.c" - prog_orig=${OUTDIR}/$name.orig${EXEEXT} - prog_opt=${OUTDIR}/$name.$ext${EXEEXT} - output_orig=${OUTDIR}/$name.orig.out - output_opt=${OUTDIR}/$name.$ext.out - dir=`dirname $i` - if [ $verbose = "yes" ]; then - echo ./ppcg$EXEEXT -I $DIR/$dir $DIR/$i \ - $CPPFLAGS -o $source_opt $ppcg_options - fi - ./ppcg$EXEEXT -I $DIR/$dir $DIR/$i $CPPFLAGS \ - -o $source_opt $ppcg_options || exit - $CC -I $DIR/$dir $CPPFLAGS $DIR/$i -o $prog_orig \ - $DIR/utilities/polybench.c $CFLAGS - $prog_orig 2> $output_orig - if [ $verbose = "yes" ]; then - echo $CC -I $DIR/$dir $CPPFLAGS $source_opt \ - -o $prog_opt $DIR/utilities/polybench.c \ - $CFLAGS $cc_options - fi - $CC -I $DIR/$dir $CPPFLAGS $source_opt -o $prog_opt \ - $DIR/utilities/polybench.c $CFLAGS $cc_options || exit - - $prog_opt 2> $output_opt - cmp $output_orig $output_opt || exit - done -} - -run_tests ppcg "--target=c --tile" -run_tests ppcg_live "--target=c --no-live-range-reordering --tile" - -# Test OpenMP code, if compiler supports openmp -if [ $HAVE_OPENMP = "yes" ]; then - run_tests ppcg_omp "--target=c --openmp" -fopenmp - echo Introduced `grep -R 'omp parallel' "${OUTDIR}" | wc -l` '"pragma omp parallel for"' -else - echo Compiler does not 
support OpenMP. Skipping OpenMP tests. -fi - -if [ $HAVE_OPENCL = "yes" ]; then - run_tests ppcg_opencl "--target=opencl --opencl-no-use-gpu" \ - "-I $srcdir $srcdir/ocl_utilities.c -lOpenCL" -fi - -if [ $keep = "no" ]; then - rm -r "${OUTDIR}" -fi diff --git a/polly/lib/External/ppcg/ppcg.h b/polly/lib/External/ppcg/ppcg.h deleted file mode 100644 --- a/polly/lib/External/ppcg/ppcg.h +++ /dev/null @@ -1,128 +0,0 @@ -#ifndef PPCG_H -#define PPCG_H - -#include -#include -#include -#include -#include -#include - -#include "ppcg_options.h" - -const char *ppcg_base_name(const char *filename); -int ppcg_extract_base_name(char *name, const char *input); - -/* Representation of the scop for use inside PPCG. - * - * "options" are the options specified by the user. - * Some fields in this structure may depend on some of the options. - * - * "start" and "end" are file offsets of the corresponding program text. - * "context" represents constraints on the parameters. - * "domain" is the union of all iteration domains. - * "call" contains the iteration domains of statements with a call expression. - * "reads" contains all potential read accesses. - * "tagged_reads" is the same as "reads", except that the domain is a wrapped - * relation mapping an iteration domain to a reference identifier - * "live_in" contains the potential read accesses that potentially - * have no corresponding writes in the scop. - * "may_writes" contains all potential write accesses. - * "tagged_may_writes" is the same as "may_writes", except that the domain - * is a wrapped relation mapping an iteration domain - * to a reference identifier - * "must_writes" contains all definite write accesses. - * "tagged_must_writes" is the same as "must_writes", except that the domain - * is a wrapped relation mapping an iteration domain - * to a reference identifier - * "live_out" contains the potential write accesses that are potentially - * not killed by any kills or any other writes. 
- * "must_kills" contains all definite kill accesses. - * "tagged_must_kills" is the same as "must_kills", except that the domain - * is a wrapped relation mapping an iteration domain - * to a reference identifier. - * - * "tagger" maps tagged iteration domains to the corresponding untagged - * iteration domain. - * - * "independence" is the union of all independence filters. - * - * "dep_flow" represents the potential flow dependences. - * "tagged_dep_flow" is the same as "dep_flow", except that both domain and - * range are wrapped relations mapping an iteration domain to - * a reference identifier. May be NULL if not computed. - * "dep_false" represents the potential false (anti and output) dependences. - * "dep_forced" represents the validity constraints that should be enforced - * even when live-range reordering is used. - * In particular, these constraints ensure that all live-in - * accesses remain live-in and that all live-out accesses remain live-out - * and that multiple potential sources for the same read are - * executed in the original order. - * "dep_order"/"tagged_dep_order" represents the order dependences between - * the live range intervals in "dep_flow"/"tagged_dep_flow". - * It is only used if the live_range_reordering - * option is set. Otherwise it is NULL. - * If "dep_order" is used, then "dep_false" only contains a limited - * set of anti and output dependences. - * "schedule" represents the (original) schedule. - * - * "names" contains all variable names that are in use by the scop. - * The names are mapped to a dummy value. - * - * "pet" is the original pet_scop. 
- */ -struct ppcg_scop { - struct ppcg_options *options; - - unsigned start; - unsigned end; - - isl_set *context; - isl_union_set *domain; - isl_union_set *call; - isl_union_map *tagged_reads; - isl_union_map *reads; - isl_union_map *live_in; - isl_union_map *tagged_may_writes; - isl_union_map *may_writes; - isl_union_map *tagged_must_writes; - isl_union_map *must_writes; - isl_union_map *live_out; - isl_union_map *tagged_must_kills; - isl_union_map *must_kills; - - isl_union_pw_multi_aff *tagger; - - isl_union_map *independence; - - isl_union_map *dep_flow; - isl_union_map *tagged_dep_flow; - isl_union_map *dep_false; - isl_union_map *dep_forced; - isl_union_map *dep_order; - isl_union_map *tagged_dep_order; - isl_schedule *schedule; - - isl_id_to_ast_expr *names; - - struct pet_scop *pet; -}; - -int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop); -__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop, - int n, const char *prefix); - -int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out, - struct ppcg_options *options, - __isl_give isl_printer *(*fn)(__isl_take isl_printer *p, - struct ppcg_scop *scop, void *user), void *user); - -__isl_give isl_schedule *ppcg_compute_schedule( - __isl_take isl_schedule_constraints *sc, - __isl_keep isl_schedule *schedule, struct ppcg_options *options); - -void compute_tagger(struct ppcg_scop *ps); -void compute_dependences(struct ppcg_scop *scop); -void eliminate_dead_code(struct ppcg_scop *ps); -void *ppcg_scop_free(struct ppcg_scop *ps); -#endif diff --git a/polly/lib/External/ppcg/ppcg.c b/polly/lib/External/ppcg/ppcg.c deleted file mode 100644 --- a/polly/lib/External/ppcg/ppcg.c +++ /dev/null @@ -1,1067 +0,0 @@ -/* - * Copyright 2011 INRIA Saclay - * Copyright 2013 Ecole Normale Superieure - * Copyright 2015 Sven Verdoolaege - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, - * Parc Club Orsay Universite, ZAC 
des vignes, 4 rue Jacques Monod, - * 91893 Orsay, France - * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ppcg.h" -#include "ppcg_options.h" -#include "cuda.h" -#include "opencl.h" -#include "cpu.h" - -struct options { - struct pet_options *pet; - struct ppcg_options *ppcg; - char *input; - char *output; -}; - -const char *ppcg_version(void); -static void print_version(void) -{ - printf("%s", ppcg_version()); -} - -ISL_ARGS_START(struct options, options_args) -ISL_ARG_CHILD(struct options, pet, "pet", &pet_options_args, "pet options") -ISL_ARG_CHILD(struct options, ppcg, NULL, &ppcg_options_args, "ppcg options") -ISL_ARG_STR(struct options, output, 'o', NULL, - "filename", NULL, "output filename (c and opencl targets)") -ISL_ARG_ARG(struct options, input, "input", NULL) -ISL_ARG_VERSION(print_version) -ISL_ARGS_END - -ISL_ARG_DEF(options, struct options, options_args) - -/* Return a pointer to the final path component of "filename" or - * to "filename" itself if it does not contain any components. - */ -const char *ppcg_base_name(const char *filename) -{ - const char *base; - - base = strrchr(filename, '/'); - if (base) - return ++base; - else - return filename; -} - -/* Copy the base name of "input" to "name" and return its length. - * "name" is not NULL terminated. - * - * In particular, remove all leading directory components and - * the final extension, if any. - */ -int ppcg_extract_base_name(char *name, const char *input) -{ - const char *base; - const char *ext; - int len; - - base = ppcg_base_name(input); - ext = strrchr(base, '.'); - len = ext ? ext - base : strlen(base); - - memcpy(name, base, len); - - return len; -} - -/* Does "scop" refer to any arrays that are declared, but not - * exposed to the code after the scop? 
- */ -int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop) -{ - int i; - - if (!scop) - return 0; - - // This is a pet feature not available in Polly. - return 0; - - for (i = 0; i < scop->pet->n_array; ++i) - if (scop->pet->arrays[i]->declared && - !scop->pet->arrays[i]->exposed) - return 1; - - return 0; -} - -/* Collect all variable names that are in use in "scop". - * In particular, collect all parameters in the context and - * all the array names. - * Store these names in an isl_id_to_ast_expr by mapping - * them to a dummy value (0). - */ -static __isl_give isl_id_to_ast_expr *collect_names(struct pet_scop *scop) -{ - int i, n; - isl_ctx *ctx; - isl_ast_expr *zero; - isl_id_to_ast_expr *names; - - ctx = isl_set_get_ctx(scop->context); - - n = isl_set_dim(scop->context, isl_dim_param); - - names = isl_id_to_ast_expr_alloc(ctx, n + scop->n_array); - zero = isl_ast_expr_from_val(isl_val_zero(ctx)); - - for (i = 0; i < n; ++i) { - isl_id *id; - - id = isl_set_get_dim_id(scop->context, isl_dim_param, i); - names = isl_id_to_ast_expr_set(names, - id, isl_ast_expr_copy(zero)); - } - - for (i = 0; i < scop->n_array; ++i) { - struct pet_array *array = scop->arrays[i]; - isl_id *id; - - id = isl_set_get_tuple_id(array->extent); - names = isl_id_to_ast_expr_set(names, - id, isl_ast_expr_copy(zero)); - } - - isl_ast_expr_free(zero); - - return names; -} - -/* Return an isl_id called "prefix%d", with "%d" set to "i". - * If an isl_id with such a name already appears among the variable names - * of "scop", then adjust the name to "prefix%d_%d". 
- */ -static __isl_give isl_id *generate_name(struct ppcg_scop *scop, - const char *prefix, int i) -{ - int j; - char name[16]; - isl_ctx *ctx; - isl_id *id; - int has_name; - - ctx = isl_set_get_ctx(scop->context); - snprintf(name, sizeof(name), "%s%d", prefix, i); - id = isl_id_alloc(ctx, name, NULL); - - j = 0; - while ((has_name = isl_id_to_ast_expr_has(scop->names, id)) == 1) { - isl_id_free(id); - snprintf(name, sizeof(name), "%s%d_%d", prefix, i, j++); - id = isl_id_alloc(ctx, name, NULL); - } - - return has_name < 0 ? isl_id_free(id) : id; -} - -/* Return a list of "n" isl_ids of the form "prefix%d". - * If an isl_id with such a name already appears among the variable names - * of "scop", then adjust the name to "prefix%d_%d". - */ -__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop, - int n, const char *prefix) -{ - int i; - isl_ctx *ctx; - isl_id_list *names; - - ctx = isl_set_get_ctx(scop->context); - names = isl_id_list_alloc(ctx, n); - for (i = 0; i < n; ++i) { - isl_id *id; - - id = generate_name(scop, prefix, i); - names = isl_id_list_add(names, id); - } - - return names; -} - -/* Is "stmt" not a kill statement? - */ -static int is_not_kill(struct pet_stmt *stmt) -{ - return !pet_stmt_is_kill(stmt); -} - -/* Collect the iteration domains of the statements in "scop" that - * satisfy "pred". 
- */ -static __isl_give isl_union_set *collect_domains(struct pet_scop *scop, - int (*pred)(struct pet_stmt *stmt)) -{ - int i; - isl_set *domain_i; - isl_union_set *domain; - - if (!scop) - return NULL; - - domain = isl_union_set_empty(isl_set_get_space(scop->context)); - - for (i = 0; i < scop->n_stmt; ++i) { - struct pet_stmt *stmt = scop->stmts[i]; - - if (!pred(stmt)) - continue; - - if (stmt->n_arg > 0) - isl_die(isl_union_set_get_ctx(domain), - isl_error_unsupported, - "data dependent conditions not supported", - return isl_union_set_free(domain)); - - domain_i = isl_set_copy(scop->stmts[i]->domain); - domain = isl_union_set_add_set(domain, domain_i); - } - - return domain; -} - -/* Collect the iteration domains of the statements in "scop", - * skipping kill statements. - */ -static __isl_give isl_union_set *collect_non_kill_domains(struct pet_scop *scop) -{ - return collect_domains(scop, &is_not_kill); -} - -/* This function is used as a callback to pet_expr_foreach_call_expr - * to detect if there is any call expression in the input expression. - * Assign the value 1 to the integer that "user" points to and - * abort the search since we have found what we were looking for. - */ -static int set_has_call(__isl_keep pet_expr *expr, void *user) -{ - int *has_call = user; - - *has_call = 1; - - return -1; -} - -/* Does "expr" contain any call expressions? - */ -static int expr_has_call(__isl_keep pet_expr *expr) -{ - int has_call = 0; - - if (pet_expr_foreach_call_expr(expr, &set_has_call, &has_call) < 0 && - !has_call) - return -1; - - return has_call; -} - -/* This function is a callback for pet_tree_foreach_expr. - * If "expr" contains any call (sub)expressions, then set *has_call - * and abort the search. - */ -static int check_call(__isl_keep pet_expr *expr, void *user) -{ - int *has_call = user; - - if (expr_has_call(expr)) - *has_call = 1; - - return *has_call ? -1 : 0; -} - -/* Does "stmt" contain any call expressions? 
- */ -static int has_call(struct pet_stmt *stmt) -{ - int has_call = 0; - - if (pet_tree_foreach_expr(stmt->body, &check_call, &has_call) < 0 && - !has_call) - return -1; - - return has_call; -} - -/* Collect the iteration domains of the statements in "scop" - * that contain a call expression. - */ -static __isl_give isl_union_set *collect_call_domains(struct pet_scop *scop) -{ - return collect_domains(scop, &has_call); -} - -/* Given a union of "tagged" access relations of the form - * - * [S_i[...] -> R_j[]] -> A_k[...] - * - * project out the "tags" (R_j[]). - * That is, return a union of relations of the form - * - * S_i[...] -> A_k[...] - */ -static __isl_give isl_union_map *project_out_tags( - __isl_take isl_union_map *umap) -{ - return isl_union_map_domain_factor_domain(umap); -} - -/* Construct a function from tagged iteration domains to the corresponding - * untagged iteration domains with as range of the wrapped map in the domain - * the reference tags that appear in any of the reads, writes or kills. - * Store the result in ps->tagger. - * - * For example, if the statement with iteration space S[i,j] - * contains two array references R_1[] and R_2[], then ps->tagger will contain - * - * { [S[i,j] -> R_1[]] -> S[i,j]; [S[i,j] -> R_2[]] -> S[i,j] } - */ -void compute_tagger(struct ppcg_scop *ps) -{ - isl_union_map *tagged; - isl_union_pw_multi_aff *tagger; - - tagged = isl_union_map_copy(ps->tagged_reads); - tagged = isl_union_map_union(tagged, - isl_union_map_copy(ps->tagged_may_writes)); - tagged = isl_union_map_union(tagged, - isl_union_map_copy(ps->tagged_must_kills)); - tagged = isl_union_map_universe(tagged); - tagged = isl_union_set_unwrap(isl_union_map_domain(tagged)); - - tagger = isl_union_map_domain_map_union_pw_multi_aff(tagged); - - ps->tagger = tagger; -} - -/* Compute the live out accesses, i.e., the writes that are - * potentially not killed by any kills or any other writes, and - * store them in ps->live_out. 
- * - * We compute the "dependence" of any "kill" (an explicit kill - * or a must write) on any may write. - * The elements accessed by the may writes with a "depending" kill - * also accessing the element are definitely killed. - * The remaining may writes can potentially be live out. - * - * The result of the dependence analysis is - * - * { IW -> [IK -> A] } - * - * with IW the instance of the write statement, IK the instance of kill - * statement and A the element that was killed. - * The range factor range is - * - * { IW -> A } - * - * containing all such pairs for which there is a kill statement instance, - * i.e., all pairs that have been killed. - */ -static void compute_live_out(struct ppcg_scop *ps) -{ - isl_schedule *schedule; - isl_union_map *kills; - isl_union_map *exposed; - isl_union_map *covering; - isl_union_access_info *access; - isl_union_flow *flow; - - schedule = isl_schedule_copy(ps->schedule); - kills = isl_union_map_union(isl_union_map_copy(ps->must_writes), - isl_union_map_copy(ps->must_kills)); - access = isl_union_access_info_from_sink(kills); - access = isl_union_access_info_set_may_source(access, - isl_union_map_copy(ps->may_writes)); - access = isl_union_access_info_set_schedule(access, schedule); - flow = isl_union_access_info_compute_flow(access); - covering = isl_union_flow_get_full_may_dependence(flow); - isl_union_flow_free(flow); - - covering = isl_union_map_range_factor_range(covering); - exposed = isl_union_map_copy(ps->may_writes); - exposed = isl_union_map_subtract(exposed, covering); - ps->live_out = exposed; -} - -/* Compute the tagged flow dependences and the live_in accesses and store - * the results in ps->tagged_dep_flow and ps->live_in. - * - * We allow both the must writes and the must kills to serve as - * definite sources such that a subsequent read would not depend - * on any earlier write. The resulting flow dependences with - * a must kill as source reflect possibly uninitialized reads. 
- * No dependences need to be introduced to protect such reads - * (other than those imposed by potential flows from may writes - * that follow the kill). We therefore remove those flow dependences. - * This is also useful for the dead code elimination, which assumes - * the flow sources are non-kill instances. - */ -static void compute_tagged_flow_dep_only(struct ppcg_scop *ps) -{ - isl_union_pw_multi_aff *tagger; - isl_schedule *schedule; - isl_union_map *live_in; - isl_union_access_info *access; - isl_union_flow *flow; - isl_union_map *must_source; - isl_union_map *kills; - isl_union_map *tagged_flow; - - tagger = isl_union_pw_multi_aff_copy(ps->tagger); - schedule = isl_schedule_copy(ps->schedule); - schedule = isl_schedule_pullback_union_pw_multi_aff(schedule, tagger); - kills = isl_union_map_copy(ps->tagged_must_kills); - must_source = isl_union_map_copy(ps->tagged_must_writes); - must_source = isl_union_map_union(must_source, - isl_union_map_copy(kills)); - access = isl_union_access_info_from_sink( - isl_union_map_copy(ps->tagged_reads)); - access = isl_union_access_info_set_must_source(access, must_source); - access = isl_union_access_info_set_may_source(access, - isl_union_map_copy(ps->tagged_may_writes)); - access = isl_union_access_info_set_schedule(access, schedule); - flow = isl_union_access_info_compute_flow(access); - tagged_flow = isl_union_flow_get_may_dependence(flow); - tagged_flow = isl_union_map_subtract_domain(tagged_flow, - isl_union_map_domain(kills)); - ps->tagged_dep_flow = tagged_flow; - live_in = isl_union_flow_get_may_no_source(flow); - ps->live_in = project_out_tags(live_in); - isl_union_flow_free(flow); -} - -/* Compute ps->dep_flow from ps->tagged_dep_flow - * by projecting out the reference tags. 
- */ -static void derive_flow_dep_from_tagged_flow_dep(struct ppcg_scop *ps) -{ - ps->dep_flow = isl_union_map_copy(ps->tagged_dep_flow); - ps->dep_flow = isl_union_map_factor_domain(ps->dep_flow); -} - -/* Compute the flow dependences and the live_in accesses and store - * the results in ps->dep_flow and ps->live_in. - * A copy of the flow dependences, tagged with the reference tags - * is stored in ps->tagged_dep_flow. - * - * We first compute ps->tagged_dep_flow, i.e., the tagged flow dependences - * and then project out the tags. - */ -static void compute_tagged_flow_dep(struct ppcg_scop *ps) -{ - compute_tagged_flow_dep_only(ps); - derive_flow_dep_from_tagged_flow_dep(ps); -} - -/* Compute the order dependences that prevent the potential live ranges - * from overlapping. - * - * In particular, construct a union of relations - * - * [R[...] -> R_1[]] -> [W[...] -> R_2[]] - * - * where [R[...] -> R_1[]] is the range of one or more live ranges - * (i.e., a read) and [W[...] -> R_2[]] is the domain of one or more - * live ranges (i.e., a write). Moreover, the read and the write - * access the same memory element and the read occurs before the write - * in the original schedule. - * The scheduler allows some of these dependences to be violated, provided - * the adjacent live ranges are all local (i.e., their domain and range - * are mapped to the same point by the current schedule band). - * - * Note that if a live range is not local, then we need to make - * sure it does not overlap with _any_ other live range, and not - * just with the "previous" and/or the "next" live range. - * We therefore add order dependences between reads and - * _any_ later potential write. - * - * We also need to be careful about writes without a corresponding read. - * They are already prevented from moving past non-local preceding - * intervals, but we also need to prevent them from moving past non-local - * following intervals. 
We therefore also add order dependences from - * potential writes that do not appear in any intervals - * to all later potential writes. - * Note that dead code elimination should have removed most of these - * dead writes, but the dead code elimination may not remove all dead writes, - * so we need to consider them to be safe. - * - * The order dependences are computed by computing the "dataflow" - * from the above unmatched writes and the reads to the may writes. - * The unmatched writes and the reads are treated as may sources - * such that they would not kill order dependences from earlier - * such writes and reads. - */ -static void compute_order_dependences(struct ppcg_scop *ps) -{ - isl_union_map *reads; - isl_union_map *shared_access; - isl_union_set *matched; - isl_union_map *unmatched; - isl_union_pw_multi_aff *tagger; - isl_schedule *schedule; - isl_union_access_info *access; - isl_union_flow *flow; - - tagger = isl_union_pw_multi_aff_copy(ps->tagger); - schedule = isl_schedule_copy(ps->schedule); - schedule = isl_schedule_pullback_union_pw_multi_aff(schedule, tagger); - reads = isl_union_map_copy(ps->tagged_reads); - matched = isl_union_map_domain(isl_union_map_copy(ps->tagged_dep_flow)); - unmatched = isl_union_map_copy(ps->tagged_may_writes); - unmatched = isl_union_map_subtract_domain(unmatched, matched); - reads = isl_union_map_union(reads, unmatched); - access = isl_union_access_info_from_sink( - isl_union_map_copy(ps->tagged_may_writes)); - access = isl_union_access_info_set_may_source(access, reads); - access = isl_union_access_info_set_schedule(access, schedule); - flow = isl_union_access_info_compute_flow(access); - shared_access = isl_union_flow_get_may_dependence(flow); - isl_union_flow_free(flow); - - ps->tagged_dep_order = isl_union_map_copy(shared_access); - ps->dep_order = isl_union_map_factor_domain(shared_access); -} - -/* Compute those validity dependences of the program represented by "scop" - * that should be unconditionally enforced 
even when live-range reordering - * is used. - * - * In particular, compute the external false dependences - * as well as order dependences between sources with the same sink. - * The anti-dependences are already taken care of by the order dependences. - * The external false dependences are only used to ensure that live-in and - * live-out data is not overwritten by any writes inside the scop. - * The independences are removed from the external false dependences, - * but not from the order dependences between sources with the same sink. - * - * In particular, the reads from live-in data need to precede any - * later write to the same memory element. - * As to live-out data, the last writes need to remain the last writes. - * That is, any earlier write in the original schedule needs to precede - * the last write to the same memory element in the computed schedule. - * The possible last writes have been computed by compute_live_out. - * They may include kills, but if the last access is a kill, - * then the corresponding dependences will effectively be ignored - * since we do not schedule any kill statements. - * - * Note that the set of live-in and live-out accesses may be - * an overapproximation. There may therefore be potential writes - * before a live-in access and after a live-out access. - * - * In the presence of may-writes, there may be multiple live-ranges - * with the same sink, accessing the same memory element. - * The sources of these live-ranges need to be executed - * in the same relative order as in the original program - * since we do not know which of the may-writes will actually - * perform a write. Consider all sources that share a sink and - * that may write to the same memory element and compute - * the order dependences among them. 
- */ -static void compute_forced_dependences(struct ppcg_scop *ps) -{ - isl_union_map *shared_access; - isl_union_map *exposed; - isl_union_map *live_in; - isl_union_map *sink_access; - isl_union_map *shared_sink; - isl_union_access_info *access; - isl_union_flow *flow; - isl_schedule *schedule; - - exposed = isl_union_map_copy(ps->live_out); - schedule = isl_schedule_copy(ps->schedule); - access = isl_union_access_info_from_sink(exposed); - access = isl_union_access_info_set_may_source(access, - isl_union_map_copy(ps->may_writes)); - access = isl_union_access_info_set_schedule(access, schedule); - flow = isl_union_access_info_compute_flow(access); - shared_access = isl_union_flow_get_may_dependence(flow); - isl_union_flow_free(flow); - ps->dep_forced = shared_access; - - schedule = isl_schedule_copy(ps->schedule); - access = isl_union_access_info_from_sink( - isl_union_map_copy(ps->may_writes)); - access = isl_union_access_info_set_may_source(access, - isl_union_map_copy(ps->live_in)); - access = isl_union_access_info_set_schedule(access, schedule); - flow = isl_union_access_info_compute_flow(access); - live_in = isl_union_flow_get_may_dependence(flow); - isl_union_flow_free(flow); - - ps->dep_forced = isl_union_map_union(ps->dep_forced, live_in); - ps->dep_forced = isl_union_map_subtract(ps->dep_forced, - isl_union_map_copy(ps->independence)); - - schedule = isl_schedule_copy(ps->schedule); - sink_access = isl_union_map_copy(ps->tagged_dep_flow); - sink_access = isl_union_map_range_product(sink_access, - isl_union_map_copy(ps->tagged_may_writes)); - sink_access = isl_union_map_domain_factor_domain(sink_access); - access = isl_union_access_info_from_sink( - isl_union_map_copy(sink_access)); - access = isl_union_access_info_set_may_source(access, sink_access); - access = isl_union_access_info_set_schedule(access, schedule); - flow = isl_union_access_info_compute_flow(access); - shared_sink = isl_union_flow_get_may_dependence(flow); - isl_union_flow_free(flow); - 
ps->dep_forced = isl_union_map_union(ps->dep_forced, shared_sink); -} - -/* Remove independence from the tagged flow dependences. - * Since the user has guaranteed that source and sink of an independence - * can be executed in any order, there cannot be a flow dependence - * between them, so they can be removed from the set of flow dependences. - * However, if the source of such a flow dependence is a must write, - * then it may have killed other potential sources, which would have - * to be recovered if we were to remove those flow dependences. - * We therefore keep the flow dependences that originate in a must write, - * even if it corresponds to a known independence. - */ -static void remove_independences_from_tagged_flow(struct ppcg_scop *ps) -{ - isl_union_map *tf; - isl_union_set *indep; - isl_union_set *mw; - - tf = isl_union_map_copy(ps->tagged_dep_flow); - tf = isl_union_map_zip(tf); - indep = isl_union_map_wrap(isl_union_map_copy(ps->independence)); - tf = isl_union_map_intersect_domain(tf, indep); - tf = isl_union_map_zip(tf); - mw = isl_union_map_domain(isl_union_map_copy(ps->tagged_must_writes)); - tf = isl_union_map_subtract_domain(tf, mw); - ps->tagged_dep_flow = isl_union_map_subtract(ps->tagged_dep_flow, tf); -} - -/* Compute the dependences of the program represented by "scop" - * in case live range reordering is allowed. - * - * We compute the actual live ranges and the corresponding order - * false dependences. - * - * The independences are removed from the flow dependences - * (provided the source is not a must-write) as well as - * from the external false dependences (by compute_forced_dependences). 
- */ -static void compute_live_range_reordering_dependences(struct ppcg_scop *ps) -{ - compute_tagged_flow_dep_only(ps); - remove_independences_from_tagged_flow(ps); - derive_flow_dep_from_tagged_flow_dep(ps); - compute_order_dependences(ps); - compute_forced_dependences(ps); -} - -/* Compute the potential flow dependences and the potential live in - * accesses. - */ -static void compute_flow_dep(struct ppcg_scop *ps) -{ - isl_union_access_info *access; - isl_union_flow *flow; - - access = isl_union_access_info_from_sink(isl_union_map_copy(ps->reads)); - access = isl_union_access_info_set_must_source(access, - isl_union_map_copy(ps->must_writes)); - access = isl_union_access_info_set_may_source(access, - isl_union_map_copy(ps->may_writes)); - access = isl_union_access_info_set_schedule(access, - isl_schedule_copy(ps->schedule)); - flow = isl_union_access_info_compute_flow(access); - - ps->dep_flow = isl_union_flow_get_may_dependence(flow); - ps->live_in = isl_union_flow_get_may_no_source(flow); - isl_union_flow_free(flow); -} - -/* Compute the dependences of the program represented by "scop". - * Store the computed potential flow dependences - * in scop->dep_flow and the reads with potentially no corresponding writes in - * scop->live_in. - * Store the potential live out accesses in scop->live_out. - * Store the potential false (anti and output) dependences in scop->dep_false. - * - * If live range reordering is allowed, then we compute a separate - * set of order dependences and a set of external false dependences - * in compute_live_range_reordering_dependences. 
- */ -void compute_dependences(struct ppcg_scop *scop) -{ - isl_union_map *may_source; - isl_union_access_info *access; - isl_union_flow *flow; - - if (!scop) - return; - - compute_live_out(scop); - - if (scop->options->live_range_reordering) - compute_live_range_reordering_dependences(scop); - else if (scop->options->target != PPCG_TARGET_C) - compute_tagged_flow_dep(scop); - else - compute_flow_dep(scop); - - may_source = isl_union_map_union(isl_union_map_copy(scop->may_writes), - isl_union_map_copy(scop->reads)); - access = isl_union_access_info_from_sink( - isl_union_map_copy(scop->may_writes)); - access = isl_union_access_info_set_must_source(access, - isl_union_map_copy(scop->must_writes)); - access = isl_union_access_info_set_may_source(access, may_source); - access = isl_union_access_info_set_schedule(access, - isl_schedule_copy(scop->schedule)); - flow = isl_union_access_info_compute_flow(access); - - scop->dep_false = isl_union_flow_get_may_dependence(flow); - scop->dep_false = isl_union_map_coalesce(scop->dep_false); - isl_union_flow_free(flow); -} - -/* Eliminate dead code from ps->domain. - * - * In particular, intersect both ps->domain and the domain of - * ps->schedule with the (parts of) iteration - * domains that are needed to produce the output or for statement - * iterations that call functions. - * Also intersect the range of the dataflow dependences with - * this domain such that the removed instances will no longer - * be considered as targets of dataflow. - * - * We start with the iteration domains that call functions - * and the set of iterations that last write to an array - * (except those that are later killed). - * - * Then we add those statement iterations that produce - * something needed by the "live" statements iterations. - * We keep doing this until no more statement iterations can be added. 
- * To ensure that the procedure terminates, we compute the affine - * hull of the live iterations (bounded to the original iteration - * domains) each time we have added extra iterations. - */ -void eliminate_dead_code(struct ppcg_scop *ps) -{ - isl_union_set *live; - isl_union_map *dep; - isl_union_pw_multi_aff *tagger; - - live = isl_union_map_domain(isl_union_map_copy(ps->live_out)); - if (!isl_union_set_is_empty(ps->call)) { - live = isl_union_set_union(live, isl_union_set_copy(ps->call)); - live = isl_union_set_coalesce(live); - } - - dep = isl_union_map_copy(ps->dep_flow); - dep = isl_union_map_reverse(dep); - - for (;;) { - isl_union_set *extra; - - extra = isl_union_set_apply(isl_union_set_copy(live), - isl_union_map_copy(dep)); - if (isl_union_set_is_subset(extra, live)) { - isl_union_set_free(extra); - break; - } - - live = isl_union_set_union(live, extra); - live = isl_union_set_affine_hull(live); - live = isl_union_set_intersect(live, - isl_union_set_copy(ps->domain)); - } - - isl_union_map_free(dep); - - ps->domain = isl_union_set_intersect(ps->domain, - isl_union_set_copy(live)); - ps->schedule = isl_schedule_intersect_domain(ps->schedule, - isl_union_set_copy(live)); - ps->dep_flow = isl_union_map_intersect_range(ps->dep_flow, - isl_union_set_copy(live)); - tagger = isl_union_pw_multi_aff_copy(ps->tagger); - live = isl_union_set_preimage_union_pw_multi_aff(live, tagger); - ps->tagged_dep_flow = isl_union_map_intersect_range(ps->tagged_dep_flow, - live); -} - -/* Intersect "set" with the set described by "str", taking the NULL - * string to represent the universal set. 
- */ -static __isl_give isl_set *set_intersect_str(__isl_take isl_set *set, - const char *str) -{ - isl_ctx *ctx; - isl_set *set2; - - if (!str) - return set; - - ctx = isl_set_get_ctx(set); - set2 = isl_set_read_from_str(ctx, str); - set = isl_set_intersect(set, set2); - - return set; -} - -void *ppcg_scop_free(struct ppcg_scop *ps) -{ - if (!ps) - return NULL; - - isl_set_free(ps->context); - isl_union_set_free(ps->domain); - isl_union_set_free(ps->call); - isl_union_map_free(ps->tagged_reads); - isl_union_map_free(ps->reads); - isl_union_map_free(ps->live_in); - isl_union_map_free(ps->tagged_may_writes); - isl_union_map_free(ps->tagged_must_writes); - isl_union_map_free(ps->may_writes); - isl_union_map_free(ps->must_writes); - isl_union_map_free(ps->live_out); - isl_union_map_free(ps->tagged_must_kills); - isl_union_map_free(ps->must_kills); - isl_union_map_free(ps->tagged_dep_flow); - isl_union_map_free(ps->dep_flow); - isl_union_map_free(ps->dep_false); - isl_union_map_free(ps->dep_forced); - isl_union_map_free(ps->tagged_dep_order); - isl_union_map_free(ps->dep_order); - isl_schedule_free(ps->schedule); - isl_union_pw_multi_aff_free(ps->tagger); - isl_union_map_free(ps->independence); - isl_id_to_ast_expr_free(ps->names); - - free(ps); - - return NULL; -} - -/* Extract a ppcg_scop from a pet_scop. - * - * The constructed ppcg_scop refers to elements from the pet_scop - * so the pet_scop should not be freed before the ppcg_scop. 
- */ -static struct ppcg_scop *ppcg_scop_from_pet_scop(struct pet_scop *scop, - struct ppcg_options *options) -{ - int i; - isl_ctx *ctx; - struct ppcg_scop *ps; - - if (!scop) - return NULL; - - ctx = isl_set_get_ctx(scop->context); - - ps = isl_calloc_type(ctx, struct ppcg_scop); - if (!ps) - return NULL; - - ps->names = collect_names(scop); - ps->options = options; - ps->start = pet_loc_get_start(scop->loc); - ps->end = pet_loc_get_end(scop->loc); - ps->context = isl_set_copy(scop->context); - ps->context = set_intersect_str(ps->context, options->ctx); - if (options->non_negative_parameters) { - isl_space *space = isl_set_get_space(ps->context); - isl_set *nn = isl_set_nat_universe(space); - ps->context = isl_set_intersect(ps->context, nn); - } - ps->domain = collect_non_kill_domains(scop); - ps->call = collect_call_domains(scop); - ps->tagged_reads = pet_scop_get_tagged_may_reads(scop); - ps->reads = pet_scop_get_may_reads(scop); - ps->tagged_may_writes = pet_scop_get_tagged_may_writes(scop); - ps->may_writes = pet_scop_get_may_writes(scop); - ps->tagged_must_writes = pet_scop_get_tagged_must_writes(scop); - ps->must_writes = pet_scop_get_must_writes(scop); - ps->tagged_must_kills = pet_scop_get_tagged_must_kills(scop); - ps->must_kills = pet_scop_get_must_kills(scop); - ps->schedule = isl_schedule_copy(scop->schedule); - ps->pet = scop; - ps->independence = isl_union_map_empty(isl_set_get_space(ps->context)); - for (i = 0; i < scop->n_independence; ++i) - ps->independence = isl_union_map_union(ps->independence, - isl_union_map_copy(scop->independences[i]->filter)); - - compute_tagger(ps); - compute_dependences(ps); - eliminate_dead_code(ps); - - if (!ps->context || !ps->domain || !ps->call || !ps->reads || - !ps->may_writes || !ps->must_writes || !ps->tagged_must_kills || - !ps->must_kills || !ps->schedule || !ps->independence || !ps->names) - return ppcg_scop_free(ps); - - return ps; -} - -/* Internal data structure for ppcg_transform. 
- */ -struct ppcg_transform_data { - struct ppcg_options *options; - __isl_give isl_printer *(*transform)(__isl_take isl_printer *p, - struct ppcg_scop *scop, void *user); - void *user; -}; - -/* Should we print the original code? - * That is, does "scop" involve any data dependent conditions or - * nested expressions that cannot be handled by pet_stmt_build_ast_exprs? - */ -static int print_original(struct pet_scop *scop, struct ppcg_options *options) -{ - if (!pet_scop_can_build_ast_exprs(scop)) { - if (options->debug->verbose) - fprintf(stdout, "Printing original code because " - "some index expressions cannot currently " - "be printed\n"); - return 1; - } - - if (pet_scop_has_data_dependent_conditions(scop)) { - if (options->debug->verbose) - fprintf(stdout, "Printing original code because " - "input involves data dependent conditions\n"); - return 1; - } - - return 0; -} - -/* Callback for pet_transform_C_source that transforms - * the given pet_scop to a ppcg_scop before calling the - * ppcg_transform callback. - * - * If "scop" contains any data dependent conditions or if we may - * not be able to print the transformed program, then just print - * the original code. - */ -static __isl_give isl_printer *transform(__isl_take isl_printer *p, - struct pet_scop *scop, void *user) -{ - struct ppcg_transform_data *data = user; - struct ppcg_scop *ps; - - if (print_original(scop, data->options)) { - p = pet_scop_print_original(scop, p); - pet_scop_free(scop); - return p; - } - - scop = pet_scop_align_params(scop); - ps = ppcg_scop_from_pet_scop(scop, data->options); - - p = data->transform(p, ps, data->user); - - ppcg_scop_free(ps); - pet_scop_free(scop); - - return p; -} - -/* Transform the C source file "input" by rewriting each scop - * through a call to "transform". - * The transformed C code is written to "out". - * - * This is a wrapper around pet_transform_C_source that transforms - * the pet_scop to a ppcg_scop before calling "fn". 
- */ -int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out, - struct ppcg_options *options, - __isl_give isl_printer *(*fn)(__isl_take isl_printer *p, - struct ppcg_scop *scop, void *user), void *user) -{ - struct ppcg_transform_data data = { options, fn, user }; - return pet_transform_C_source(ctx, input, out, &transform, &data); -} - -/* Check consistency of options. - * - * Return -1 on error. - */ -static int check_options(isl_ctx *ctx) -{ - struct options *options; - - options = isl_ctx_peek_options(ctx, &options_args); - if (!options) - isl_die(ctx, isl_error_internal, - "unable to find options", return -1); - - if (options->ppcg->openmp && - !isl_options_get_ast_build_atomic_upper_bound(ctx)) - isl_die(ctx, isl_error_invalid, - "OpenMP requires atomic bounds", return -1); - - return 0; -} - -#if 0 -int main(int argc, char **argv) -{ - int r; - isl_ctx *ctx; - struct options *options; - - options = options_new_with_defaults(); - assert(options); - - ctx = isl_ctx_alloc_with_options(&options_args, options); - ppcg_options_set_target_defaults(options->ppcg); - isl_options_set_ast_build_detect_min_max(ctx, 1); - isl_options_set_ast_print_macro_once(ctx, 1); - isl_options_set_schedule_whole_component(ctx, 0); - isl_options_set_schedule_maximize_band_depth(ctx, 1); - isl_options_set_schedule_maximize_coincidence(ctx, 1); - pet_options_set_encapsulate_dynamic_control(ctx, 1); - argc = options_parse(options, argc, argv, ISL_ARG_ALL); - - if (check_options(ctx) < 0) - r = EXIT_FAILURE; - else if (options->ppcg->target == PPCG_TARGET_CUDA) - r = generate_cuda(ctx, options->ppcg, options->input); - else if (options->ppcg->target == PPCG_TARGET_OPENCL) - r = generate_opencl(ctx, options->ppcg, options->input, - options->output); - else - r = generate_cpu(ctx, options->ppcg, options->input, - options->output); - - isl_ctx_free(ctx); - - return r; -} -#endif diff --git a/polly/lib/External/ppcg/ppcg_options.h b/polly/lib/External/ppcg/ppcg_options.h deleted file 
mode 100644 --- a/polly/lib/External/ppcg/ppcg_options.h +++ /dev/null @@ -1,100 +0,0 @@ -#ifndef PPCG_OPTIONS_H -#define PPCG_OPTIONS_H - -#include -#include - -struct ppcg_debug_options { - int dump_schedule_constraints; - int dump_schedule; - int dump_final_schedule; - int dump_sizes; - int verbose; -}; - -struct ppcg_options { - struct isl_options *isl; - struct ppcg_debug_options *debug; - - /* Group chains of consecutive statements before scheduling. */ - int group_chains; - - /* Use isl to compute a schedule replacing the original schedule. */ - int reschedule; - int scale_tile_loops; - int wrap; - - /* Assume all parameters are non-negative. */ - int non_negative_parameters; - char *ctx; - char *sizes; - - /* Perform tiling (C target). */ - int tile; - int tile_size; - - /* Isolate full tiles from partial tiles. */ - int isolate_full_tiles; - - /* Take advantage of private memory. */ - int use_private_memory; - - /* Take advantage of shared memory. */ - int use_shared_memory; - - /* Maximal amount of shared memory. */ - int max_shared_memory; - - /* The target we generate code for. */ - int target; - - /* Generate OpenMP macros (C target only). */ - int openmp; - - /* Linearize all device arrays. */ - int linearize_device_arrays; - - /* Allow the use of GNU extensions in generated code. */ - int allow_gnu_extensions; - - /* Allow live range to be reordered. */ - int live_range_reordering; - - /* Allow hybrid tiling whenever a suitable input pattern is found. */ - int hybrid; - - /* Unroll the code for copying to/from shared memory. */ - int unroll_copy_shared; - /* Unroll code inside tile on GPU targets. */ - int unroll_gpu_tile; - - /* Options to pass to the OpenCL compiler. */ - char *opencl_compiler_options; - /* Prefer GPU device over CPU. */ - int opencl_use_gpu; - /* Number of files to include. */ - int opencl_n_include_file; - /* Files to include. */ - const char **opencl_include_files; - /* Print definitions of types in kernels. 
*/ - int opencl_print_kernel_types; - /* Embed OpenCL kernel code in host code. */ - int opencl_embed_kernel_code; - - /* Name of file for saving isl computed schedule or NULL. */ - char *save_schedule_file; - /* Name of file for loading schedule or NULL. */ - char *load_schedule_file; -}; - -ISL_ARG_DECL(ppcg_debug_options, struct ppcg_debug_options, - ppcg_debug_options_args) -ISL_ARG_DECL(ppcg_options, struct ppcg_options, ppcg_options_args) - -#define PPCG_TARGET_C 0 -#define PPCG_TARGET_CUDA 1 -#define PPCG_TARGET_OPENCL 2 - -void ppcg_options_set_target_defaults(struct ppcg_options *options); - -#endif diff --git a/polly/lib/External/ppcg/ppcg_options.c b/polly/lib/External/ppcg/ppcg_options.c deleted file mode 100644 --- a/polly/lib/External/ppcg/ppcg_options.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2010-2011 INRIA Saclay - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, - * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, - * 91893 Orsay, France - */ - -#include "ppcg_options.h" - -static struct isl_arg_choice target[] = { - {"c", PPCG_TARGET_C}, - {"cuda", PPCG_TARGET_CUDA}, - {"opencl", PPCG_TARGET_OPENCL}, - {0} -}; - -/* Set defaults that depend on the target. - * In particular, set --schedule-outer-coincidence iff target is a GPU. - */ -void ppcg_options_set_target_defaults(struct ppcg_options *options) -{ - char *argv[2] = { NULL }; - - argv[0] = "ppcg_options_set_target_defaults"; - if (options->target == PPCG_TARGET_C) - argv[1] = "--no-schedule-outer-coincidence"; - else - argv[1] = "--schedule-outer-coincidence"; - - isl_options_parse(options->isl, 2, argv, ISL_ARG_ALL); -} - -/* Callback that is called whenever the "target" option is set (to "val"). - * The callback is called after target has been updated. - * - * Call ppcg_options_set_target_defaults to reset the target-dependent options. 
- */ -static int set_target(void *opt, unsigned val) -{ - struct ppcg_options *options = opt; - - ppcg_options_set_target_defaults(options); - - return 0; -} - -ISL_ARGS_START(struct ppcg_debug_options, ppcg_debug_options_args) -ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule_constraints, 0, - "dump-schedule-constraints", 0, "dump schedule constraints") -ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule, 0, - "dump-schedule", 0, "dump isl computed schedule") -ISL_ARG_BOOL(struct ppcg_debug_options, dump_final_schedule, 0, - "dump-final-schedule", 0, "dump PPCG computed schedule") -ISL_ARG_BOOL(struct ppcg_debug_options, dump_sizes, 0, - "dump-sizes", 0, - "dump effectively used per kernel tile, grid and block sizes") -ISL_ARG_BOOL(struct ppcg_debug_options, verbose, 'v', "verbose", 0, NULL) -ISL_ARGS_END - -ISL_ARGS_START(struct ppcg_options, ppcg_opencl_options_args) -ISL_ARG_STR(struct ppcg_options, opencl_compiler_options, 0, "compiler-options", - "options", NULL, "options to pass to the OpenCL compiler") -ISL_ARG_BOOL(struct ppcg_options, opencl_use_gpu, 0, "use-gpu", 1, - "use GPU device (if available)") -ISL_ARG_STR_LIST(struct ppcg_options, opencl_n_include_file, - opencl_include_files, 0, "include-file", "filename", - "file to #include in generated OpenCL code") -ISL_ARG_BOOL(struct ppcg_options, opencl_print_kernel_types, 0, - "print-kernel-types", 1, - "print definitions of types in the kernel file") -ISL_ARG_BOOL(struct ppcg_options, opencl_embed_kernel_code, 0, - "embed-kernel-code", 0, "embed kernel code into host code") -ISL_ARGS_END - -ISL_ARGS_START(struct ppcg_options, ppcg_options_args) -ISL_ARG_CHILD(struct ppcg_options, isl, "isl", &isl_options_args, "isl options") -ISL_ARG_CHILD(struct ppcg_options, debug, NULL, &ppcg_debug_options_args, - "debugging options") -ISL_ARG_BOOL(struct ppcg_options, group_chains, 0, "group-chains", 1, - "group chains of interdependent statements that are executed " - "consecutively in the original schedule 
before scheduling") -ISL_ARG_BOOL(struct ppcg_options, reschedule, 0, "reschedule", 1, - "replace original schedule by isl computed schedule") -ISL_ARG_BOOL(struct ppcg_options, scale_tile_loops, 0, - "scale-tile-loops", 1, NULL) -ISL_ARG_BOOL(struct ppcg_options, wrap, 0, "wrap", 1, NULL) -ISL_ARG_BOOL(struct ppcg_options, use_shared_memory, 0, "shared-memory", 1, - "use shared memory in kernel code") -ISL_ARG_BOOL(struct ppcg_options, use_private_memory, 0, "private-memory", 1, - "use private memory in kernel code") -ISL_ARG_STR(struct ppcg_options, ctx, 0, "ctx", "context", NULL, - "Constraints on parameters") -ISL_ARG_BOOL(struct ppcg_options, non_negative_parameters, 0, - "assume-non-negative-parameters", 0, - "assume all parameters are non-negative)") -ISL_ARG_BOOL(struct ppcg_options, tile, 0, "tile", 0, - "perform tiling (C target)") -ISL_ARG_INT(struct ppcg_options, tile_size, 'S', "tile-size", "size", 32, NULL) -ISL_ARG_BOOL(struct ppcg_options, isolate_full_tiles, 0, "isolate-full-tiles", - 0, "isolate full tiles from partial tiles (hybrid tiling)") -ISL_ARG_STR(struct ppcg_options, sizes, 0, "sizes", "sizes", NULL, - "Per kernel tile, grid and block sizes") -ISL_ARG_INT(struct ppcg_options, max_shared_memory, 0, - "max-shared-memory", "size", 8192, "maximal amount of shared memory") -ISL_ARG_BOOL(struct ppcg_options, openmp, 0, "openmp", 0, - "Generate OpenMP macros (only for C target)") -ISL_ARG_USER_OPT_CHOICE(struct ppcg_options, target, 0, "target", target, - &set_target, PPCG_TARGET_CUDA, PPCG_TARGET_CUDA, - "the target to generate code for") -ISL_ARG_BOOL(struct ppcg_options, linearize_device_arrays, 0, - "linearize-device-arrays", 1, - "linearize all device arrays, even those of fixed size") -ISL_ARG_BOOL(struct ppcg_options, allow_gnu_extensions, 0, - "allow-gnu-extensions", 1, - "allow the use of GNU extensions in generated code") -ISL_ARG_BOOL(struct ppcg_options, live_range_reordering, 0, - "live-range-reordering", 1, - "allow successive live 
ranges on the same memory element " - "to be reordered") -ISL_ARG_BOOL(struct ppcg_options, hybrid, 0, "hybrid", 0, - "apply hybrid tiling whenever a suitable input pattern is found " - "(GPU targets)") -ISL_ARG_BOOL(struct ppcg_options, unroll_copy_shared, 0, "unroll-copy-shared", - 0, "unroll code for copying to/from shared memory") -ISL_ARG_BOOL(struct ppcg_options, unroll_gpu_tile, 0, "unroll-gpu-tile", 0, - "unroll code inside tile on GPU targets") -ISL_ARG_GROUP("opencl", &ppcg_opencl_options_args, "OpenCL options") -ISL_ARG_STR(struct ppcg_options, save_schedule_file, 0, "save-schedule", - "file", NULL, "save isl computed schedule to ") -ISL_ARG_STR(struct ppcg_options, load_schedule_file, 0, "load-schedule", - "file", NULL, "load schedule from , " - "using it instead of an isl computed schedule") -ISL_ARGS_END diff --git a/polly/lib/External/ppcg/print.h b/polly/lib/External/ppcg/print.h deleted file mode 100644 --- a/polly/lib/External/ppcg/print.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef PRINT_H -#define PRINT_H - -#include - -#include "ppcg.h" - -extern const char *ppcg_min; -extern const char *ppcg_max; -extern const char *ppcg_fdiv_q; - -__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p); -__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p); - -__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p); -__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p, - const char *min, const char *max); -__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type, - __isl_take isl_printer *p); -__isl_give isl_printer *ppcg_ast_expr_print_macros( - __isl_keep isl_ast_expr *expr, __isl_take isl_printer *p); -__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p, - __isl_keep isl_id_to_ast_expr *ref2expr); -__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node); - -__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff 
*size, - __isl_keep isl_ast_build *build); - -__isl_give isl_printer *ppcg_print_declaration_with_size( - __isl_take isl_printer *p, const char *base_type, - __isl_keep isl_ast_expr *size); -__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p, - struct pet_array *array, __isl_keep isl_ast_build *build); -__isl_give isl_printer *ppcg_print_exposed_declarations( - __isl_take isl_printer *p, struct ppcg_scop *scop); -__isl_give isl_printer *ppcg_print_hidden_declarations( - __isl_take isl_printer *p, struct ppcg_scop *scop); - -#endif diff --git a/polly/lib/External/ppcg/print.c b/polly/lib/External/ppcg/print.c deleted file mode 100644 --- a/polly/lib/External/ppcg/print.c +++ /dev/null @@ -1,461 +0,0 @@ -/* - * Copyright 2012-2013 Ecole Normale Superieure - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France - */ - -#include -#include -#include - -#include "print.h" -#include "util.h" - -__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "{"); - p = isl_printer_end_line(p); - p = isl_printer_indent(p, 2); - return p; -} - -__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p) -{ - p = isl_printer_indent(p, -2); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "}"); - p = isl_printer_end_line(p); - return p; -} - -/* Names of notes that keep track of whether min/max - * macro definitions have already been printed. - */ -static const char *ppcg_max_printed = "ppcg_max_printed"; -static const char *ppcg_min_printed = "ppcg_min_printed"; - -/* Has the macro definition corresponding to "note_name" been printed - * to "p" before? - * That is, does "p" have an associated "note_name" note? 
- */ -static isl_bool printed_before(__isl_keep isl_printer *p, const char *note_name) -{ - isl_ctx *ctx; - isl_id *id; - isl_bool printed; - - if (!p) - return isl_bool_error; - - ctx = isl_printer_get_ctx(p); - id = isl_id_alloc(ctx, note_name, NULL); - printed = isl_printer_has_note(p, id); - isl_id_free(id); - - return printed; -} - -/* Keep track of the fact that the macro definition corresponding - * to "note_name" has been printed to "p" by attaching a note with - * that name. The value of the note is of no importance, but it - * has to be a valid isl_id, so the note identifier is reused - * as the note. - */ -static __isl_give isl_printer *mark_printed(__isl_take isl_printer *p, - const char *note_name) -{ - isl_ctx *ctx; - isl_id *id; - - if (!p) - return NULL; - - ctx = isl_printer_get_ctx(p); - id = isl_id_alloc(ctx, note_name, NULL); - return isl_printer_set_note(p, id, isl_id_copy(id)); -} - -/* Print a macro definition "def" for the macro "name" to "p", - * unless such a macro definition has been printed to "p" before. - * "note_name" is used as the name of the note that keeps track - * of whether this printing has happened. - */ -static __isl_give isl_printer *print_ppcg_macro(__isl_take isl_printer *p, - const char *name, const char *def, const char *note_name) -{ - isl_bool printed; - - printed = printed_before(p, note_name); - if (printed < 0) - return isl_printer_free(p); - if (printed) - return p; - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "#define "); - p = isl_printer_print_str(p, name); - p = isl_printer_print_str(p, def); - p = isl_printer_end_line(p); - - p = mark_printed(p, note_name); - - return p; -} - -/* Structure for keeping track of definitions of some macros. - */ -struct ppcg_macros { - const char *min; - const char *max; -}; - -/* Free the memory allocated by a struct ppcg_macros. 
- */ -static void ppcg_macros_free(void *user) -{ - free(user); -} - -/* Default macro definitions (when GNU extensions are allowed). - */ -struct ppcg_macros ppcg_macros_default = { - .min = "(x,y) " - "({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); " - "_x < _y ? _x : _y; })", - .max = "(x,y) " - "({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); " - "_x > _y ? _x : _y; })", -}; - -/* Name used for the note that keeps track of macro definitions. - */ -static const char *ppcg_macros = "ppcg_macros"; - -/* Set the macro definitions for isl_ast_op_min and isl_ast_op_max - * to "min" and "max" and store them in "p". - * - * In particular, create a ppcg_macros object and attach it - * as a note to the printer. - */ -__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p, - const char *min, const char *max) -{ - isl_ctx *ctx; - isl_id *id, *macros_id; - struct ppcg_macros *macros; - - if (!p) - return NULL; - - ctx = isl_printer_get_ctx(p); - macros = isl_alloc_type(ctx, struct ppcg_macros); - if (!macros) - return isl_printer_free(p); - macros->min = min; - macros->max = max; - id = isl_id_alloc(ctx, ppcg_macros, NULL); - macros_id = isl_id_alloc(ctx, NULL, macros); - if (!macros_id) - ppcg_macros_free(macros); - else - macros_id = isl_id_set_free_user(macros_id, &ppcg_macros_free); - - p = isl_printer_set_note(p, id, macros_id); - - return p; -} - -/* Return the ppcg_macros object that holds the currently active - * macro definitions in "p". - * If "p" has a note with macro definitions, then return those. - * Otherwise, return the default macro definitions. 
- */ -static struct ppcg_macros *get_macros(__isl_keep isl_printer *p) -{ - isl_id *id; - isl_bool has_macros; - struct ppcg_macros *macros; - - id = isl_id_alloc(isl_printer_get_ctx(p), ppcg_macros, NULL); - has_macros = isl_printer_has_note(p, id); - if (has_macros < 0 || !has_macros) { - isl_id_free(id); - if (has_macros < 0) - return NULL; - return &ppcg_macros_default; - } - id = isl_printer_get_note(p, id); - macros = isl_id_get_user(id); - isl_id_free(id); - - return macros; -} - -/* Print the currently active macro definition for ppcg_max. - */ -static __isl_give isl_printer *print_max(__isl_take isl_printer *p) -{ - struct ppcg_macros *macros; - - macros = get_macros(p); - if (!macros) - return isl_printer_free(p); - return print_ppcg_macro(p, ppcg_max, macros->max, ppcg_max_printed); -} - -/* Print the currently active macro definition for ppcg_min. - */ -static __isl_give isl_printer *print_min(__isl_take isl_printer *p) -{ - struct ppcg_macros *macros; - - macros = get_macros(p); - if (!macros) - return isl_printer_free(p); - return print_ppcg_macro(p, ppcg_min, macros->min, ppcg_min_printed); -} - -/* Print a macro definition for "type" to "p". - * If GNU extensions are allowed, then print a specialized definition - * for isl_ast_op_min and isl_ast_op_max. - * Otherwise, use the default isl definition. 
- */ -__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type, - __isl_take isl_printer *p) -{ - isl_ctx *ctx; - struct ppcg_options *options; - - if (!p) - return NULL; - - ctx = isl_printer_get_ctx(p); - options = isl_ctx_peek_options(ctx, &ppcg_options_args); - if (!options || !options->allow_gnu_extensions) - return isl_ast_op_type_print_macro(type, p); - - switch (type) { - case isl_ast_op_max: - return print_max(p); - case isl_ast_op_min: - return print_min(p); - default: - return isl_ast_op_type_print_macro(type, p); - } -} - -/* isl_ast_expr_foreach_ast_op_type or isl_ast_node_foreach_ast_op_type - * callback that prints a macro definition for "type". - */ -static isl_stat print_macro(enum isl_ast_op_type type, void *user) -{ - isl_printer **p = user; - - *p = ppcg_print_macro(type, *p); - if (!*p) - return isl_stat_error; - - return isl_stat_ok; -} - -/* Print the required macros for "expr". - */ -__isl_give isl_printer *ppcg_ast_expr_print_macros( - __isl_keep isl_ast_expr *expr, __isl_take isl_printer *p) -{ - if (isl_ast_expr_foreach_ast_op_type(expr, &print_macro, &p) < 0) - return isl_printer_free(p); - return p; -} - -/* isl_id_to_ast_expr_foreach callback that prints the required - * macro definitions for "val". - */ -static isl_stat print_expr_macros(__isl_take isl_id *key, - __isl_take isl_ast_expr *val, void *user) -{ - isl_printer **p = user; - - *p = ppcg_ast_expr_print_macros(val, *p); - isl_id_free(key); - isl_ast_expr_free(val); - - if (!*p) - return isl_stat_error; - return isl_stat_ok; -} - -/* Print the required macro definitions for the body of a statement in which - * the access expressions are replaced by the isl_ast_expr objects - * in "ref2expr". 
- */ -__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p, - __isl_keep isl_id_to_ast_expr *ref2expr) -{ - if (isl_id_to_ast_expr_foreach(ref2expr, &print_expr_macros, &p) < 0) - return isl_printer_free(p); - return p; -} - -/* Print the required macros for "node". - */ -__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node) -{ - if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0) - return isl_printer_free(p); - return p; -} - -/* Names used for the macros that may appear in a printed isl AST. - */ -const char *ppcg_min = "ppcg_min"; -const char *ppcg_max = "ppcg_max"; -const char *ppcg_fdiv_q = "ppcg_fdiv_q"; - -/* Set the names of the macros that may appear in a printed isl AST. - */ -__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p) -{ - p = isl_ast_op_type_set_print_name(p, isl_ast_op_min, ppcg_min); - p = isl_ast_op_type_set_print_name(p, isl_ast_op_max, ppcg_max); - p = isl_ast_op_type_set_print_name(p, isl_ast_op_fdiv_q, ppcg_fdiv_q); - - return p; -} - -/* Given a multi affine expression "mpa" without domain, modify it to have - * the schedule space of "build" as domain. - * - * If the schedule space of "build" is a parameter space, then nothing - * needs to be done. - * Otherwise, "mpa" is first given a 0D domain and then it is combined - * with a mapping from the schedule space of "build" to the same 0D domain. 
- */ -__isl_give isl_multi_pw_aff *ppcg_attach_multi_pw_aff( - __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_ast_build *build) -{ - isl_bool params; - isl_space *space; - isl_multi_aff *ma; - - space = isl_ast_build_get_schedule_space(build); - params = isl_space_is_params(space); - if (params < 0 || params) { - isl_space_free(space); - if (params < 0) - return isl_multi_pw_aff_free(mpa); - return mpa; - } - space = isl_space_from_domain(space); - ma = isl_multi_aff_zero(space); - mpa = isl_multi_pw_aff_from_range(mpa); - mpa = isl_multi_pw_aff_pullback_multi_aff(mpa, ma); - - return mpa; -} - -/* Build an access AST expression from "size" using "build". - * "size" does not have a domain, but "build" may have a proper schedule space. - * First modify "size" to have that schedule space as domain. - */ -__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size, - __isl_keep isl_ast_build *build) -{ - size = ppcg_attach_multi_pw_aff(size, build); - return isl_ast_build_access_from_multi_pw_aff(build, size); -} - -/* Print a declaration for an array with element type "base_type" and - * size "size" to "p". - */ -__isl_give isl_printer *ppcg_print_declaration_with_size( - __isl_take isl_printer *p, const char *base_type, - __isl_keep isl_ast_expr *size) -{ - if (!base_type || !size) - return isl_printer_free(p); - - p = ppcg_ast_expr_print_macros(size, p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, base_type); - p = isl_printer_print_str(p, " "); - p = isl_printer_print_ast_expr(p, size); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -/* Print a declaration for array "array" to "p", using "build" - * to simplify any size expressions. - * - * The size is computed from the extent of the array and is - * subsequently converted to an "access expression" by "build". 
- */ -__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p, - struct pet_array *array, __isl_keep isl_ast_build *build) -{ - isl_multi_pw_aff *size; - isl_ast_expr *expr; - - if (!array) - return isl_printer_free(p); - - size = ppcg_size_from_extent(isl_set_copy(array->extent)); - expr = isl_ast_build_access_from_multi_pw_aff(build, size); - p = ppcg_print_declaration_with_size(p, array->element_type, expr); - isl_ast_expr_free(expr); - - return p; -} - -/* Print declarations for the arrays in "scop" that are declared - * and that are exposed (if exposed == 1) or not exposed (if exposed == 0). - */ -static __isl_give isl_printer *print_declarations(__isl_take isl_printer *p, - struct ppcg_scop *scop, int exposed) -{ - int i; - isl_ast_build *build; - - if (!scop) - return isl_printer_free(p); - - build = isl_ast_build_from_context(isl_set_copy(scop->context)); - for (i = 0; i < scop->pet->n_array; ++i) { - struct pet_array *array = scop->pet->arrays[i]; - - if (!array->declared) - continue; - if (array->exposed != exposed) - continue; - - p = ppcg_print_declaration(p, array, build); - } - isl_ast_build_free(build); - - return p; -} - -/* Print declarations for the arrays in "scop" that are declared - * and exposed to the code after the scop. - */ -__isl_give isl_printer *ppcg_print_exposed_declarations( - __isl_take isl_printer *p, struct ppcg_scop *scop) -{ - return print_declarations(p, scop, 1); -} - -/* Print declarations for the arrays in "scop" that are declared, - * but not exposed to the code after the scop. 
- */ -__isl_give isl_printer *ppcg_print_hidden_declarations( - __isl_take isl_printer *p, struct ppcg_scop *scop) -{ - return print_declarations(p, scop, 0); -} diff --git a/polly/lib/External/ppcg/schedule.h b/polly/lib/External/ppcg/schedule.h deleted file mode 100644 --- a/polly/lib/External/ppcg/schedule.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _SCHEDULE_H -#define _SCHEDULE_H - -#include -#include -#include -#include - -#include "ppcg_options.h" - -__isl_give isl_set *parametrization(__isl_take isl_space *space, - int len, int first, __isl_keep isl_id_list *names); - -__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx, - struct ppcg_options *options, - __isl_give isl_schedule *(*compute)(void *user), void *user); - -__isl_give isl_schedule_node *ppcg_set_schedule_node_type( - __isl_take isl_schedule_node *node, enum isl_ast_loop_type type); - -#endif diff --git a/polly/lib/External/ppcg/schedule.c b/polly/lib/External/ppcg/schedule.c deleted file mode 100644 --- a/polly/lib/External/ppcg/schedule.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright 2010-2011 INRIA Saclay - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, - * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, - * 91893 Orsay, France - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "schedule.h" - -/* Add parameters with identifiers "ids" to "set". 
- */ -static __isl_give isl_set *add_params(__isl_take isl_set *set, - __isl_keep isl_id_list *ids) -{ - int i, n; - unsigned nparam; - - n = isl_id_list_n_id(ids); - - nparam = isl_set_dim(set, isl_dim_param); - set = isl_set_add_dims(set, isl_dim_param, n); - - for (i = 0; i < n; ++i) { - isl_id *id; - - id = isl_id_list_get_id(ids, i); - set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id); - } - - return set; -} - -/* Equate the dimensions of "set" starting at "first" to - * freshly created parameters with identifiers "ids". - * The number of equated dimensions is equal to the number of elements in "ids". - */ -static __isl_give isl_set *parametrize(__isl_take isl_set *set, - int first, __isl_keep isl_id_list *ids) -{ - int i, n; - unsigned nparam; - - nparam = isl_set_dim(set, isl_dim_param); - - set = add_params(set, ids); - - n = isl_id_list_n_id(ids); - for (i = 0; i < n; ++i) - set = isl_set_equate(set, isl_dim_param, nparam + i, - isl_dim_set, first + i); - - return set; -} - -/* Given a parameter space "space", create a set of dimension "len" - * of which the dimensions starting at "first" are equated to - * freshly created parameters with identifiers "ids". - */ -__isl_give isl_set *parametrization(__isl_take isl_space *space, - int len, int first, __isl_keep isl_id_list *ids) -{ - isl_set *set; - - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, len); - set = isl_set_universe(space); - - return parametrize(set, first, ids); -} - -/* Load and return a schedule from a file called "filename". 
- */ -static __isl_give isl_schedule *load_schedule(isl_ctx *ctx, - const char *filename) -{ - FILE *file; - isl_schedule *schedule; - - file = fopen(filename, "r"); - if (!file) { - fprintf(stderr, "Unable to open '%s' for reading\n", filename); - return NULL; - } - schedule = isl_schedule_read_from_file(ctx, file); - fclose(file); - - return schedule; -} - -/* Save the schedule "schedule" to a file called "filename". - * The schedule is printed in block style. - */ -static void save_schedule(__isl_keep isl_schedule *schedule, - const char *filename) -{ - FILE *file; - isl_ctx *ctx; - isl_printer *p; - - if (!schedule) - return; - - file = fopen(filename, "w"); - if (!file) { - fprintf(stderr, "Unable to open '%s' for writing\n", filename); - return; - } - ctx = isl_schedule_get_ctx(schedule); - p = isl_printer_to_file(ctx, file); - p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); - p = isl_printer_print_schedule(p, schedule); - isl_printer_free(p); - fclose(file); -} - -/* Obtain a schedule, either by reading it form a file - * or by computing it using "compute". - * Also take care of saving the computed schedule and/or - * dumping the obtained schedule if requested by the user. - */ -__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx, - struct ppcg_options *options, - __isl_give isl_schedule *(*compute)(void *user), void *user) -{ - isl_schedule *schedule; - - if (options->load_schedule_file) { - schedule = load_schedule(ctx, options->load_schedule_file); - } else { - schedule = compute(user); - if (options->save_schedule_file) - save_schedule(schedule, options->save_schedule_file); - } - if (options->debug->dump_schedule) - isl_schedule_dump(schedule); - - return schedule; -} - -/* Mark all dimensions in the band node "node" to be of "type". 
- */ -__isl_give isl_schedule_node *ppcg_set_schedule_node_type( - __isl_take isl_schedule_node *node, enum isl_ast_loop_type type) -{ - int i, n; - - n = isl_schedule_node_band_n_member(node); - for (i = 0; i < n; ++i) - node = isl_schedule_node_band_member_set_ast_loop_type(node, i, - type); - - return node; -} diff --git a/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c b/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c +++ /dev/null @@ -1,49 +0,0 @@ -#include - -int main() -{ - int A[2][1000][1000]; - int B[2][1000][1000]; - -#pragma scop - { - for (int i = 0; i < 256; ++i) - for (int j = 0; j < 256; ++j) - if (j % 8 <= 2 || j % 8 >= 6) - A[1][i][j] = B[1][j][i]; - } -#pragma endscop - -/* - -When compiled with: - -./ppcg tests/allow-sparse-copy-in.c --no-linearize-device-arrays - --on-error=abort --sizes='{kernel[i]->tile[8,8]; kernel[i]->block[1,8]}' - --max-shared-memory=-1 --unroll-copy-shared - -this originally resulted in the following copy-in code: - - shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1]; - shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1]; - shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1]; - shared_B[0][3][t1] = B[1][8 * b1 + 3][8 * b0 + t1]; - shared_B[0][4][t1] = B[1][8 * b1 + 4][8 * b0 + t1]; - shared_B[0][5][t1] = B[1][8 * b1 + 5][8 * b0 + t1]; - shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1]; - shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1]; - -whereas we only want to only perform copies that are actually needed: - - shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1]; - shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1]; - shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1]; - shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1]; - shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1]; -*/ - for (int i = 0; i < 100; ++i) - if (A[1][0][i] != i) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/call.c 
b/polly/lib/External/ppcg/tests/call.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/call.c +++ /dev/null @@ -1,29 +0,0 @@ -#include - -void copy_summary(int b[1000], int a[1000], int pos) -{ - b[pos] = 0; - int c = a[pos]; -} - -#ifdef pencil_access -__attribute__((pencil_access(copy_summary))) -#endif -void copy(int b[1000], int a[1000], int pos); - -int main() -{ - int a[1000], b[1000]; - - for (int i = 0; i < 1000; ++i) - a[i] = i; -#pragma scop - for (int i = 0; i < 1000; ++i) - copy(b, a, i); -#pragma endscop - for (int i = 0; i < 1000; ++i) - if (b[i] != a[i]) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/call2.c b/polly/lib/External/ppcg/tests/call2.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/call2.c +++ /dev/null @@ -1,29 +0,0 @@ -#include - -void copy_summary(int b[1000], int a[1000], int pos) -{ - b[pos] = 0; - int c = a[pos]; -} - -#ifdef pencil_access -__attribute__((pencil_access(copy_summary))) -#endif -void copy(int b[1000], int a[1000], int pos); - -int main() -{ - int a[2][1000]; - - for (int i = 0; i < 1000; ++i) - a[0][i] = i; -#pragma scop - for (int i = 0; i < 1000; ++i) - copy(a[1], a[0], i); -#pragma endscop - for (int i = 0; i < 1000; ++i) - if (a[1][i] != a[0][i]) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/call2_opencl_functions.cl b/polly/lib/External/ppcg/tests/call2_opencl_functions.cl deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/call2_opencl_functions.cl +++ /dev/null @@ -1,4 +0,0 @@ -void copy(__global int b[1000], __global int a[1000], int pos) -{ - b[pos] = a[pos]; -} diff --git a/polly/lib/External/ppcg/tests/call3.c b/polly/lib/External/ppcg/tests/call3.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/call3.c +++ /dev/null @@ -1,32 +0,0 @@ -#include - -void copy_summary(int b[100], int a[100]) -{ - for (int i = 0; i < 100; ++i) { - b[i] = 0; - int c = a[i]; - } -} - 
-#ifdef pencil_access -__attribute__((pencil_access(copy_summary))) -#endif -void copy(int b[100], int a[100]); - -int main() -{ - int A[100][100], B[100]; - - for (int i = 0; i < 100; ++i) - B[i] = i; -#pragma scop - for (int i = 0; i < 100; ++i) - copy(A[i], B); -#pragma endscop - for (int i = 0; i < 100; ++i) - for (int j = 0; j < 100; ++j) - if (A[j][i] != B[i]) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/call3_opencl_functions.cl b/polly/lib/External/ppcg/tests/call3_opencl_functions.cl deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/call3_opencl_functions.cl +++ /dev/null @@ -1,5 +0,0 @@ -void copy(__global int b[100], __global int a[100]) -{ - for (int i = 0; i < 100; ++i) - b[i] = a[i]; -} diff --git a/polly/lib/External/ppcg/tests/call_opencl_functions.cl b/polly/lib/External/ppcg/tests/call_opencl_functions.cl deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/call_opencl_functions.cl +++ /dev/null @@ -1,4 +0,0 @@ -void copy(__global int b[1000], __global int a[1000], int pos) -{ - b[pos] = a[pos]; -} diff --git a/polly/lib/External/ppcg/tests/dead.c b/polly/lib/External/ppcg/tests/dead.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/dead.c +++ /dev/null @@ -1,23 +0,0 @@ -#include - -int main() -{ - int a[1000], b[1000]; - - for (int i = 0; i < 1000; ++i) - a[i] = i; -#pragma scop - for (int i = 0; i < 1000; ++i) { - int c; - int d; - c = a[i]; - d = c; - b[i] = c; - } -#pragma endscop - for (int i = 0; i < 1000; ++i) - if (b[i] != a[i]) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/iterator.c b/polly/lib/External/ppcg/tests/iterator.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/iterator.c +++ /dev/null @@ -1,18 +0,0 @@ -#include - -int main() -{ - int i; - int a[101]; - - i = 0; -#pragma scop - for (i = 0; i < 100; ++i) - a[i] = i; - a[i] = i; -#pragma endscop - if (a[100] != 100) - return EXIT_FAILURE; 
- - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/live_out.c b/polly/lib/External/ppcg/tests/live_out.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/live_out.c +++ /dev/null @@ -1,22 +0,0 @@ -#include - -/* Check that a write access is not removed from the live-out - * accesses only because a strict subset of the (potentially) - * accessed elements are killed by a later write. - */ -int main() -{ - int A[10]; - - A[1] = 0; -#pragma scop - int i = 1; - i = i * i; - A[i] = 1; - A[0] = 0; -#pragma endscop - if (A[1] != 1) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/local.c b/polly/lib/External/ppcg/tests/local.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/local.c +++ /dev/null @@ -1,22 +0,0 @@ -#include - -int main() -{ - int A[100]; - -#pragma scop - { - int B[100]; - B[0] = 0; - for (int i = 1; i < 100; ++i) - B[i] = B[i - 1] + 1; - for (int i = 0; i < 100; ++i) - A[i] = B[i]; - } -#pragma endscop - for (int i = 0; i < 100; ++i) - if (A[i] != i) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/loop.c b/polly/lib/External/ppcg/tests/loop.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/loop.c +++ /dev/null @@ -1,18 +0,0 @@ -#include - -int main() -{ - int a[1000], b[1000]; - - for (int i = 0; i < 1000; ++i) - a[i] = i; -#pragma scop - for (int i = 0; i < 1000; ++i) - b[i] = a[i]; -#pragma endscop - for (int i = 0; i < 1000; ++i) - if (b[i] != a[i]) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/not_accessed.c b/polly/lib/External/ppcg/tests/not_accessed.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/not_accessed.c +++ /dev/null @@ -1,29 +0,0 @@ -#include - -void copy_summary(int b[1000], int a[1000], int pos, int c[1000]) -{ - b[pos] = 0; - int d = a[pos]; -} - -#ifdef pencil_access -__attribute__((pencil_access(copy_summary))) -#endif -void 
copy(int b[1000], int a[1000], int pos, int c[1000]); - -int main() -{ - int a[1000], b[1000], c[1000]; - - for (int i = 0; i < 1000; ++i) - a[i] = i; -#pragma scop - for (int i = 0; i < 1000; ++i) - copy(b, a, i, c); -#pragma endscop - for (int i = 0; i < 1000; ++i) - if (b[i] != a[i]) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl b/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl +++ /dev/null @@ -1,5 +0,0 @@ -void copy(__global int b[1000], __global int a[1000], int pos, - __global int c[1000]) -{ - b[pos] = a[pos]; -} diff --git a/polly/lib/External/ppcg/tests/scalar.c b/polly/lib/External/ppcg/tests/scalar.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/scalar.c +++ /dev/null @@ -1,13 +0,0 @@ -#include - -int main() -{ - int a; -#pragma scop - a = 1; -#pragma endscop - if (a != 1) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/shared_sink.c b/polly/lib/External/ppcg/tests/shared_sink.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/shared_sink.c +++ /dev/null @@ -1,25 +0,0 @@ -#include - -/* Check that the sources of live ranges with the same sink - * are executed in order. 
- */ -int main() -{ - int A[128]; - int n = 128; - - A[0] = 0; -#pragma scop - for (int i = 0; i < n; ++i) { - int set = 0; - if (A[i] < 2) - set = 1; - if (set) - A[i] = 2; - } -#pragma endscop - if (A[0] != 2) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/struct.c b/polly/lib/External/ppcg/tests/struct.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/struct.c +++ /dev/null @@ -1,31 +0,0 @@ -#include - -struct s { - int c[10][10]; -}; - -int main() -{ - struct s a[10][10], b[10][10]; - - for (int i = 0; i < 10; ++i) - for (int j = 0; j < 10; ++j) - for (int k = 0; k < 10; ++k) - for (int l = 0; l < 10; ++l) - a[i][j].c[k][l] = i + j + k + l; -#pragma scop - for (int i = 0; i < 10; ++i) - for (int j = 0; j < 10; ++j) - for (int k = 0; k < 10; ++k) - for (int l = 0; l < 10; ++l) - b[i][j].c[k][l] = i + j + k + l; -#pragma endscop - for (int i = 0; i < 10; ++i) - for (int j = 0; j < 10; ++j) - for (int k = 0; k < 10; ++k) - for (int l = 0; l < 10; ++l) - if (b[i][j].c[k][l] != a[i][j].c[k][l]) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/struct2.c b/polly/lib/External/ppcg/tests/struct2.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/struct2.c +++ /dev/null @@ -1,21 +0,0 @@ -#include - -struct s { - int a; -}; - -int main() -{ - struct s a, b[10]; - -#pragma scop - a.a = 42; - for (int i = 0; i < 10; ++i) - b[i].a = a.a; -#pragma endscop - for (int i = 0; i < 10; ++i) - if (b[i].a != 42) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/struct3.c b/polly/lib/External/ppcg/tests/struct3.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/struct3.c +++ /dev/null @@ -1,25 +0,0 @@ -#include - -struct s { - int a; - int b; -}; - -int main() -{ - struct s a, b[10]; - - a.b = 57; -#pragma scop - a.a = 42; - for (int i = 0; i < 10; ++i) - b[i] = a; -#pragma endscop - for (int i = 0; i < 10; ++i) - 
if (b[i].a != 42) - return EXIT_FAILURE; - if (a.b != 57) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/tests/struct4.c b/polly/lib/External/ppcg/tests/struct4.c deleted file mode 100644 --- a/polly/lib/External/ppcg/tests/struct4.c +++ /dev/null @@ -1,27 +0,0 @@ -#include - -struct s { - int a; - int b; -}; - -int main() -{ - int a[10]; - - for (int i = 0; i < 10; ++i) - a[i] = 0; -#pragma scop - for (int i = 0; i < 10; ++i) { - struct s b; - b.a = 1; - b.b = i; - a[i] = b.a + b.b; - } -#pragma endscop - for (int i = 0; i < 10; ++i) - if (a[i] != 1 + i) - return EXIT_FAILURE; - - return EXIT_SUCCESS; -} diff --git a/polly/lib/External/ppcg/util.h b/polly/lib/External/ppcg/util.h deleted file mode 100644 --- a/polly/lib/External/ppcg/util.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef UTIL_H -#define UTIL_H - -#include - -#include -#include - -/* Compare the prefix of "s" to "prefix" up to the length of "prefix". - */ -static inline int prefixcmp(const char *s, const char *prefix) -{ - return strncmp(s, prefix, strlen(prefix)); -} - -__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space, - int val); -__isl_give isl_multi_val *ppcg_multi_val_from_int_list( - __isl_take isl_space *space, int *list); -__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set); - -#endif diff --git a/polly/lib/External/ppcg/util.c b/polly/lib/External/ppcg/util.c deleted file mode 100644 --- a/polly/lib/External/ppcg/util.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2012-2013 Ecole Normale Superieure - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include -#include -#include -#include - -#include "util.h" - -/* Construct an isl_multi_val living in "space" with all values equal to "val". 
- */ -__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space, - int val) -{ - int i, n; - isl_ctx *ctx; - isl_val *v; - isl_multi_val *mv; - - if (!space) - return NULL; - - ctx = isl_space_get_ctx(space); - n = isl_space_dim(space, isl_dim_set); - mv = isl_multi_val_zero(space); - v = isl_val_int_from_si(ctx, val); - for (i = 0; i < n; ++i) - mv = isl_multi_val_set_val(mv, i, isl_val_copy(v)); - isl_val_free(v); - - return mv; -} - -/* Construct an isl_multi_val living in "space" with values specified - * by "list". "list" is assumed to have at least as many entries - * as the set dimension of "space". - */ -__isl_give isl_multi_val *ppcg_multi_val_from_int_list( - __isl_take isl_space *space, int *list) -{ - int i, n; - isl_ctx *ctx; - isl_multi_val *mv; - - if (!space) - return NULL; - - ctx = isl_space_get_ctx(space); - n = isl_space_dim(space, isl_dim_set); - mv = isl_multi_val_zero(space); - for (i = 0; i < n; ++i) { - isl_val *v; - - v = isl_val_int_from_si(ctx, list[i]); - mv = isl_multi_val_set_val(mv, i, v); - } - - return mv; -} - -/* Compute the size of a bounding box around the origin and "set", - * where "set" is assumed to contain only non-negative elements. - * In particular, compute the maximal value of "set" in each direction - * and add one. 
- */ -__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set) -{ - int i, n; - isl_multi_pw_aff *mpa; - - n = isl_set_dim(set, isl_dim_set); - mpa = isl_multi_pw_aff_zero(isl_set_get_space(set)); - for (i = 0; i < n; ++i) { - isl_space *space; - isl_aff *one; - isl_pw_aff *bound; - - if (!isl_set_dim_has_upper_bound(set, isl_dim_set, i)) { - const char *name; - name = isl_set_get_tuple_name(set); - if (!name) - name = ""; - fprintf(stderr, "unable to determine extent of '%s' " - "in dimension %d\n", name, i); - set = isl_set_free(set); - } - bound = isl_set_dim_max(isl_set_copy(set), i); - - space = isl_pw_aff_get_domain_space(bound); - one = isl_aff_zero_on_domain(isl_local_space_from_space(space)); - one = isl_aff_add_constant_si(one, 1); - bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one)); - mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound); - } - isl_set_free(set); - - return mpa; -} diff --git a/polly/lib/External/ppcg/version.c b/polly/lib/External/ppcg/version.c deleted file mode 100644 --- a/polly/lib/External/ppcg/version.c +++ /dev/null @@ -1,6 +0,0 @@ -#include "gitversion.h" - -const char *ppcg_version(void) -{ - return GIT_HEAD_ID"\n"; -} diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp --- a/polly/lib/Support/RegisterPasses.cpp +++ b/polly/lib/Support/RegisterPasses.cpp @@ -217,14 +217,6 @@ void initializePollyPasses(llvm::PassRegistry &Registry) { initializeCodeGenerationPass(Registry); -#ifdef GPU_CODEGEN - initializePPCGCodeGenerationPass(Registry); - initializeManagedMemoryRewritePassPass(Registry); - LLVMInitializeNVPTXTarget(); - LLVMInitializeNVPTXTargetInfo(); - LLVMInitializeNVPTXTargetMC(); - LLVMInitializeNVPTXAsmPrinter(); -#endif initializeCodePreparationPass(Registry); initializeDeadCodeElimWrapperPassPass(Registry); initializeDependenceInfoPass(Registry); diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp --- 
a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -711,11 +711,6 @@ function_ref GetDeps, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, isl::schedule &LastSchedule, bool &DepsChanged) { - - // Skip SCoPs in case they're already optimised by PPCGCodeGeneration - if (S.isToBeSkipped()) - return; - // Skip empty SCoPs but still allow code generation as it will delete the // loops present but not needed. if (S.getSize() == 0) { diff --git a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll b/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll deleted file mode 100644 --- a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll +++ /dev/null @@ -1,9 +0,0 @@ -define float @__nv_expf(float %a) { - ret float %a -} -define float @__nv_cosf(float %a) { - ret float %a -} -define float @__nv_logf(float %a) { - ret float %a -} diff --git a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll b/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll deleted file mode 100644 --- a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Check that we detect a scop. -; SCOP: Function: checkScalarKill -; SCOP-NEXT: Region: %XLoopInit---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; Check that we have a scalar that is not a phi node in the scop. -; SCOP: i32 MemRef_x_0; // Element size 4 - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; Check that we add variables that are local to a scop into the kills that we -; pass to PPCG. This should enable PPCG to codegen this example. 
-; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) { -; int x; -; #pragma scop -; for(int i = 0; i < 1000; i++) { -; XLoopInit: x = 0; -; -; if (control1 > 2) -; C1Add: x += 10; -; if (control2 > 3) -; C2Add: x += A[i]; -; -; BLoopAccumX: B[i] += x; -; } -; -; #pragma endscop -; } -; ModuleID = 'test.ll' -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @checkScalarKill(ptr %A, ptr %B, ptr %C, i32 %control1, i32 %control2) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %XLoopInit - -XLoopInit: ; preds = %entry.split, %BLoopAccumX - %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ] - %cmp1 = icmp sgt i32 %control1, 2 - %x.0 = select i1 %cmp1, i32 10, i32 0 - %cmp2 = icmp sgt i32 %control2, 3 - br i1 %cmp2, label %C2Add, label %BLoopAccumX - -C2Add: ; preds = %XLoopInit - %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv - %tmp6 = load i32, ptr %arrayidx, align 4 - %add4 = add nsw i32 %tmp6, %x.0 - br label %BLoopAccumX - -BLoopAccumX: ; preds = %XLoopInit, %C2Add - %x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ] - %arrayidx7 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv - %tmp11 = load i32, ptr %arrayidx7, align 4 - %add8 = add nsw i32 %tmp11, %x.1 - store i32 %add8, ptr %arrayidx7, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %XLoopInit, label %for.end - -for.end: ; preds = %BLoopAccumX - ret void -} diff --git a/polly/test/GPGPU/align-params-in-schedule.ll b/polly/test/GPGPU/align-params-in-schedule.ll deleted file mode 100644 --- a/polly/test/GPGPU/align-params-in-schedule.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc - -; CHECK: 
polly_launchKernel - -; Verify that this program compiles. At some point, this compilation crashed -; due to insufficient parameters being available. - -source_filename = "bugpoint-output-4d01492.bc" -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -%struct.barney = type { ptr, i64, i64, [2 x %struct.widget] } -%struct.widget = type { i64, i64, i64 } - -@global = external unnamed_addr global %struct.barney, align 32 - -; Function Attrs: nounwind uwtable -define void @wobble(ptr noalias %arg) #0 { -bb: - %tmp = load i32, ptr %arg, align 4 - br label %bb1 - -bb1: ; preds = %bb13, %bb - %tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ] - br label %bb3 - -bb3: ; preds = %bb3, %bb1 - %tmp4 = load ptr, ptr @global, align 32 - %tmp5 = sext i32 %tmp2 to i64 - %tmp6 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 3, i64 1, i32 0), align 8 - %tmp7 = mul i64 %tmp6, %tmp5 - %tmp8 = add i64 %tmp7, 0 - %tmp9 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 1), align 8 - %tmp10 = add i64 %tmp8, %tmp9 - %tmp11 = getelementptr i32, ptr %tmp4, i64 %tmp10 - store i32 undef, ptr %tmp11, align 4 - %tmp12 = icmp eq i32 0, 0 - br i1 %tmp12, label %bb13, label %bb3 - -bb13: ; preds = %bb3 - %tmp14 = icmp eq i32 %tmp2, %tmp - %tmp15 = add i32 %tmp2, 1 - br i1 %tmp14, label %bb16, label %bb1 - -bb16: ; preds = %bb13 - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll b/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll deleted file mode 100644 --- a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: opt %loadPolly -S -polly-codegen-ppcg \ -; RUN: -polly-use-llvm-names < %s -; ModuleID = 'test/GPGPU/zero-size-array.ll' - 
-; REQUIRES: pollyacc - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - - -; We used to divide the element size by 8 to arrive at the 'actual' size -; of an array element. This used to cause arrays that have an element size -; of less than 8 to collapse to size 0. This test makes sure that it does -; not happen anymore. - -; f(int *niters_ptr, int *arr[0]) { -; const int inters = *niters_ptr; -; for(int i = 0; i < niters; i++) { -; arr[0][i + 1] = 0 -; } -; } - -; Function Attrs: nounwind uwtable -define void @f(ptr noalias %niters.ptr, ptr noalias %arr) #0 { -entry: - %niters = load i32, ptr %niters.ptr, align 4 - br label %loop.body - -loop.body: ; preds = %loop.body, %entry - %indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ] - %indvar.sext = sext i32 %indvar to i64 - %arr.slot = getelementptr [0 x i32], ptr %arr, i64 0, i64 %indvar.sext - store i32 0, ptr %arr.slot, align 4 - %tmp8 = icmp eq i32 %indvar, %niters - %indvar.next = add i32 %indvar, 1 - br i1 %tmp8, label %loop.exit, label %loop.body - -loop.exit: ; preds = %loop.body - %tmp10 = icmp sgt i32 undef, 0 - br label %auxiliary.loop - -auxiliary.loop: ; preds = %"101", %loop.exit - %tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ] - br i1 undef, label %auxiliary.loop, label %exit - -exit: ; preds = %auxiliary.loop - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll b/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll deleted file mode 100644 --- a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: opt %loadPolly -S -polly-codegen-ppcg \ -; RUN: -polly-ignore-parameter-bounds \ -; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s 
-check-prefix=HOST-IR -; -; REQUIRES: pollyacc - -; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain -; all the parameters present in the program. -; -; The construction of the `isl_multi_pw_aff` requires all the indivisual `pw_aff` -; to have the same parameter dimensions. To achieve this, we used to realign -; every `pw_aff` with `Scop::Context`. However, in conjunction with -; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context` -; does not contain all parameters. -; -; We check that Polly does the right thing in this case and sets up the parameter -; dimensions correctly. - - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) -; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll' - -; C pseudocode -; ------------ -; void f(int *arr, long niters, long stride) { -; for(int i = 0; i < niters; i++) { -; arr[i * stride] = 1; -; } -; } - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: nounwind uwtable -define void @f(ptr %arr, i64 %niters, i64 %stride) unnamed_addr #1 { -entry: - br label %loop - -loop: ; preds = %loop, %entry - %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ] - %idx = mul nuw nsw i64 %indvar, %stride - %slot = getelementptr i32, ptr %arr, i64 %idx - store i32 1, ptr %slot, align 4 - %indvar.next = add nuw nsw i64 %indvar, 1 - %check = icmp sgt i64 %indvar.next, %niters - br i1 %check, label %exit, label %loop - -exit: ; preds = %loop - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind uwtable } diff --git a/polly/test/GPGPU/cuda-annotations.ll b/polly/test/GPGPU/cuda-annotations.ll 
deleted file mode 100644 --- a/polly/test/GPGPU/cuda-annotations.ll +++ /dev/null @@ -1,37 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) #0 { - -; KERNEL: !nvvm.annotations = !{!0} - -; KERNEL: !0 = !{ptr @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, i64 %n) { -bb: - br label %bb1 - -bb1: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb8 - -bb2: ; preds = %bb1 - %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp4 = load i64, ptr %tmp3, align 8 - %tmp5 = add nsw i64 %tmp4, 100 - store i64 %tmp5, ptr %tmp3, align 8 - br label %bb6 - -bb6: ; preds = %bb2 - %tmp7 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb8: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll deleted file mode 100644 --- a/polly/test/GPGPU/cuda-managed-memory-simple.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc - -; -; #include -; -; static const int N = 45; -; -; void copy(int *R, int *A) { -; for (int i = 0; i < N; i++) { -; R[i] = A[i] * 10; -; } -; } -; -; int main() { -; int *A, *R; -; -; cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal); -; cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal); -; -; for (int i = 0; i < N; i++) { -; A[i] = i; -; R[i] = 0; -; } -; copy(R, A); -; -; return 0; -; } -; - -; CHECK-NOT: 
polly_copyFromHostToDevice -; CHECK-NOT: polly_copyFromDeviceToHost -; CHECK-NOT: polly_freeDeviceMemory -; CHECK-NOT: polly_allocateMemoryForDevice - -; CHECK: %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA() -; CHECK-NEXT: %[[REGCA:[0-9]+]] = bitcast i32* %A to i8* -; CHECK-NEXT: %[[REGCR:[0-9]+]] = bitcast i32* %R to i8* -; CHECK-NEXT: %[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 -; CHECK-NEXT: store i8* %[[REGCA]], i8** %polly_launch_0_param_0 -; CHECK-NEXT: %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8* -; CHECK-NEXT: store i8* %[[REGCP0]], i8** %[[REGGEP0]] -; CHECK-NEXT: %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 -; CHECK-NEXT: store i8* %[[REGCR]], i8** %polly_launch_0_param_1 -; CHECK-NEXT: %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8* -; CHECK-NEXT: store i8* %[[REGCP1]], i8** %[[REGGEP1]] -; CHECK-NEXT: %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([863 x i8], [863 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0)) -; CHECK-NEXT: call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) -; CHECK-NEXT: call void @polly_freeKernel(i8* %[[REGKERNEL]]) -; CHECK-NEXT: call void @polly_synchronizeDevice() -; CHECK-NEXT: call void @polly_freeContext(i8* %[[REGCTX]]) - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @copy(i32* %R, i32* %A) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] - %exitcond = icmp ne i64 %indvars.iv, 45 - br i1 %exitcond, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - %tmp = load i32, i32* %arrayidx, 
align 4 - %mul = mul nsw i32 %tmp, 10 - %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv - store i32 %mul, i32* %arrayidx2, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - br label %for.cond - -for.end: ; preds = %for.cond - ret void -} - -define i32 @main() { -entry: - %A = alloca i32*, align 8 - %R = alloca i32*, align 8 - %tmp = bitcast i32** %A to i8** - %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2 - %tmp1 = bitcast i32** %R to i8** - %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2 - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] - %exitcond = icmp ne i64 %indvars.iv, 45 - br i1 %exitcond, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %tmp2 = load i32*, i32** %A, align 8 - %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv - %tmp3 = trunc i64 %indvars.iv to i32 - store i32 %tmp3, i32* %arrayidx, align 4 - %tmp4 = load i32*, i32** %R, align 8 - %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv - store i32 0, i32* %arrayidx3, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - br label %for.cond - -for.end: ; preds = %for.cond - %tmp5 = load i32*, i32** %R, align 8 - %tmp6 = load i32*, i32** %A, align 8 - call void @copy(i32* %tmp5, i32* %tmp6) - ret i32 0 -} - -declare i32 @cudaMallocManaged(i8**, i64, i32) #1 diff --git a/polly/test/GPGPU/debug-metadata-leak.ll b/polly/test/GPGPU/debug-metadata-leak.ll deleted file mode 100644 --- a/polly/test/GPGPU/debug-metadata-leak.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: opt %loadPolly %s -polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: | FileCheck --check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; KERNEL-IR: define ptx_kernel void 
@FUNC_vec_add_1_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arr, i32 %N) #0 { - -; The instruction marked <<>> is copied into the GPUModule, -; with changes only to the parameters to access data on the device instead of -; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the -; instruction is annotated with a DILocation, copying the instruction also copies -; the metadata into the GPUModule. This stops codegenerating the ptx_kernel by -; failing the verification of the Module in GPUNodeBuilder::finalize, due to the -; copied DICompileUnit not being listed in a llvm.dbg.cu which was neither copied -; nor created. -; -; https://reviews.llvm.org/D35630 removes this debug metadata before the -; instruction is copied to the GPUModule. -; -; vec_add_1.c: -; void vec_add_1(int N, int arr[N]) { -; int i=0; -; for( i=0 ; i>> - store i32 %add, ptr %arrayidx, align 4, !dbg !26, !tbaa !27 - br label %for.inc, !dbg !25 - -for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31 - call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19 - br label %for.cond, !dbg !32, !llvm.loop !33 - -for.end: ; preds = %for.cond - ret void, !dbg !35 -} - -declare void @llvm.dbg.declare(metadata, metadata, metadata) - -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) - - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 5.0.0"} -!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, 
isOptimized: true, unit: !0, retainedNodes: !12) -!8 = !DISubroutineType(types: !9) -!9 = !{null, !10, !11} -!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64) -!12 = !{!13, !14, !15} -!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10) -!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11) -!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10) -!16 = !DIExpression() -!17 = !DILocation(line: 1, column: 20, scope: !7) -!18 = !DILocation(line: 1, column: 27, scope: !7) -!19 = !DILocation(line: 2, column: 7, scope: !7) -!20 = !DILocation(line: 3, column: 8, scope: !21) -!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) -!22 = !DILocation(line: 3, column: 15, scope: !23) -!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3) -!24 = !DILocation(line: 3, column: 3, scope: !21) -!25 = !DILocation(line: 3, column: 25, scope: !23) -!26 = !DILocation(line: 3, column: 32, scope: !23) -!27 = !{!28, !28, i64 0} -!28 = !{!"int", !29, i64 0} -!29 = !{!"omnipotent char", !30, i64 0} -!30 = !{!"Simple C/C++ TBAA"} -!31 = !DILocation(line: 3, column: 21, scope: !23) -!32 = !DILocation(line: 3, column: 3, scope: !23) -!33 = distinct !{!33, !24, !34} -!34 = !DILocation(line: 3, column: 35, scope: !21) -!35 = !DILocation(line: 4, column: 1, scope: !7) diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll deleted file mode 100644 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ /dev/null @@ -1,254 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=SCHED %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: 
FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-ASM - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today due to extensive output differences from when the test was written. - -; CHECK: Stmt_bb5 -; CHECK-NEXT: Domain := -; CHECK-NEXT: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }; -; CHECK-NEXT: Schedule := -; CHECK-NEXT: { Stmt_bb5[i0, i1] -> [i0, i1] }; -; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] }; -; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] }; - -; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }" -; SCHED-NEXT: child: -; SCHED-NEXT: context: "{ [] }" -; SCHED-NEXT: child: -; SCHED-NEXT: extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: sequence: -; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: set: -; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: guard: "{ [] }" -; SCHED-NEXT: - filter: "{ Stmt_bb5[i0, i1] }" -; SCHED-NEXT: child: -; SCHED-NEXT: guard: "{ [] }" -; SCHED-NEXT: child: -; SCHED-NEXT: mark: "kernel" -; SCHED-NEXT: child: -; SCHED-NEXT: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }" -; SCHED-NEXT: child: -; SCHED-NEXT: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= 
-32b1 + i1 }" -; SCHED-NEXT: child: -; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]" -; SCHED-NEXT: permutable: 1 -; SCHED-NEXT: coincident: [ 1, 1 ] -; SCHED-NEXT: child: -; SCHED-NEXT: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }" -; SCHED-NEXT: child: -; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]" -; SCHED-NEXT: permutable: 1 -; SCHED-NEXT: coincident: [ 1, 1 ] -; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: set: -; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: guard: "{ [] }" - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(16, 32); -; CODE-NEXT: dim3 k0_dimGrid(32, 32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) -; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3); - -; IR: polly.split_new_and_old: -; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024) -; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1 -; IR-NEXT: %polly.overflow.state = or i1 false, %.obit -; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0 -; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024) -; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1 -; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1 -; IR-NEXT: %.res3 = extractvalue { 
i64, i1 } %1, 0 -; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3) -; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1 -; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4 -; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0 -; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6) -; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1 -; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7 -; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0 -; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440 -; IR-NEXT: %5 = and i1 true, %4 -; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true -; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown -; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2 - -; IR: polly.start: -; IR-NEXT: br label %polly.acc.initialize - -; IR: polly.acc.initialize: -; IR-NEXT: [[GPUContext:%.*]] = call ptr @polly_initContext() -; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 4194304) -; IR-NEXT: call void @polly_copyFromHostToDevice(ptr %A, ptr %p_dev_array_MemRef_A, i64 4194304) -; IR-NEXT: [[DevPtr:%.*]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A) -; IR-NEXT: store ptr [[DevPtr]], ptr %polly_launch_0_param_0 -; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params -; IR-NEXT: call ptr @polly_getKernel -; IR-NEXT: call void @polly_launchKernel(ptr %11, i32 32, i32 32, i32 32, i32 16, i32 1, ptr %polly_launch_0_params_i8ptr) -; IR-NEXT: call void @polly_freeKernel -; IR-NEXT: call void @polly_copyFromDeviceToHost(ptr %p_dev_array_MemRef_A, ptr %A, i64 4194304) -; IR-NEXT: call void @polly_freeDeviceMemory(ptr %p_dev_array_MemRef_A) -; IR-NEXT: call void @polly_freeContext(ptr [[GPUContext]]) -; IR-NEXT: br label %polly.exiting - -; IR: polly.exiting: -; IR-NEXT: br label %polly.merge_new_and_old - -; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(ptr %MemRef_A) 
#0 { -; KERNEL-IR-NEXT: entry: -; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() -; KERNEL-IR-NEXT: %b1 = zext i32 %1 to i64 -; KERNEL-IR-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-IR-NEXT: %t0 = zext i32 %2 to i64 -; KERNEL-IR-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() -; KERNEL-IR-NEXT: %t1 = zext i32 %3 to i64 -; KERNEL-IR-NEXT: br label %polly.loop_preheader - -; KERNEL-IR-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5 -; KERNEL-IR-NEXT: ret void - -; KERNEL-IR-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader -; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ] -; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0 -; KERNEL-IR-NEXT: %6 = mul nsw i64 32, %b1 -; KERNEL-IR-NEXT: %7 = add nsw i64 %6, %t1 -; KERNEL-IR-NEXT: %8 = mul nsw i64 16, %polly.indvar -; KERNEL-IR-NEXT: %9 = add nsw i64 %7, %8 -; KERNEL-IR-NEXT: br label %polly.stmt.bb5 - -; KERNEL-IR-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header -; KERNEL-IR-NEXT: %10 = mul i64 %5, %9 -; KERNEL-IR-NEXT: %p_tmp6 = sitofp i64 %10 to float -; KERNEL-IR-NEXT: %11 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %12 = add nsw i64 %11, %t0 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024 -; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b1 -; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t1 -; KERNEL-IR-NEXT: %15 = mul nsw i64 16, %polly.indvar -; KERNEL-IR-NEXT: %16 = add nsw i64 %14, %15 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16 -; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A -; KERNEL-IR-NEXT: %tmp8_p_scalar_ = load float, ptr %polly.access.MemRef_A, align 4 -; KERNEL-IR-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, 
%p_tmp6 -; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024 -; KERNEL-IR-NEXT: %19 = mul nsw i64 32, %b1 -; KERNEL-IR-NEXT: %20 = add nsw i64 %19, %t1 -; KERNEL-IR-NEXT: %21 = mul nsw i64 16, %polly.indvar -; KERNEL-IR-NEXT: %22 = add nsw i64 %20, %21 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22 -; KERNEL-IR-NEXT: %polly.access.MemRef_A4 = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A3 -; KERNEL-IR-NEXT: store float %p_tmp9, ptr %polly.access.MemRef_A4, align 4 -; KERNEL-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 -; KERNEL-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 0 -; KERNEL-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit - -; KERNEL-IR-LABEL: polly.loop_preheader: ; preds = %entry -; KERNEL-IR-NEXT: br label %polly.loop_header - -; KERNEL-IR: attributes #0 = { "polly.skip.fn" } - -; KERNEL-ASM: .version 3.2 -; KERNEL-ASM-NEXT: .target sm_30 -; KERNEL-ASM-NEXT: .address_size 64 - -; KERNEL-ASM: // .globl kernel_0 - -; KERNEL-ASM: .visible .entry kernel_0( -; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0 -; KERNEL-ASM-NEXT: ) - -; void double_parallel_loop(float A[][1024]) { -; for (long i = 0; i < 1024; i++) -; for (long j = 0; j < 1024; j++) -; A[i][j] += i * j; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @double_parallel_loop(ptr %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb13, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] - %exitcond1 = icmp ne i64 %i.0, 1024 - br i1 %exitcond1, label %bb3, label %bb15 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb10, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %j.0, 1024 - br i1 %exitcond, label %bb5, label %bb12 - -bb5: ; preds = %bb4 - %tmp = mul nuw nsw i64 %i.0, %j.0 - %tmp6 = sitofp i64 
%tmp to float - %tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0 - %tmp8 = load float, ptr %tmp7, align 4 - %tmp9 = fadd float %tmp8, %tmp6 - store float %tmp9, ptr %tmp7, align 4 - br label %bb10 - -bb10: ; preds = %bb5 - %tmp11 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12 - %tmp14 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb15: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/failing-invariant-load-handling.ll b/polly/test/GPGPU/failing-invariant-load-handling.ll deleted file mode 100644 --- a/polly/test/GPGPU/failing-invariant-load-handling.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS -; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" - -%S = type { i32, i32, [12 x %L] } -%L = type { i32, i32, double, i32, i32, i32, i32, i32 } - -define void @test(ptr %cpi, i1 %b) { -; SCOPS-LABEL: Region: %if.then14---%exit -; SCOPS: Invariant Accesses: { -; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] }; -; SCOPS-NEXT: Execution Context: [l2, l1] -> { : } -; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] }; -; SCOPS-NEXT: Execution Context: [l2, l1] -> { : l2 > 0 } -; SCOPS-NEXT: } -; SCOPS: Arrays { -; SCOPS-NEXT: i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4 -; SCOPS-NEXT: } - -; Check that we gracefully handle failing invariant loads. 
-; This test case is taken from: -; test/Isl/CodeGen/invariant-load-dimension.ll - -; FIXME: Figure out how to actually generate code for this loop. -; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function - -entry: - %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1 - br i1 %b, label %if.then14, label %exit - -if.then14: - %l0 = load i32, ptr %cpi, align 8 - %cmp12.i = icmp sgt i32 %l0, 0 - br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit - -for.body.lr.ph.i: - %l1 = load i32, ptr %nt, align 4 - br label %for.body.i - -for.body.i: - %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ] - %mul.i163 = mul nsw i32 %phi, %l1 - %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0 - store i32 0, ptr %cv, align 8 - %inc = add nuw nsw i32 %phi, 1 - %l2 = load i32, ptr %cpi, align 8 - %cmp.i164 = icmp slt i32 %inc, %l2 - br i1 %cmp.i164, label %for.body.i, label %exit - -exit: - ret void -} diff --git a/polly/test/GPGPU/failing-invariant-load-hoisting.ll b/polly/test/GPGPU/failing-invariant-load-hoisting.ll deleted file mode 100644 --- a/polly/test/GPGPU/failing-invariant-load-hoisting.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" - -%S = type { i32, i32, [12 x %L] } -%L = type { i32, i32, double, i32, i32, i32, i32, i32 } - -define void @test(ptr %cpi, i1 %b) { -; CODEGEN-LABEL: @test( -; CODEGEN: polly.preload.begin: -; CODEGEN-NEXT: br i1 false - -entry: - %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1 - br i1 %b, label %if.then14, label %exit - -if.then14: - %l0 = load i32, ptr %cpi, align 8 - %cmp12.i = icmp sgt i32 %l0, 0 - br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit - -for.body.lr.ph.i: - %l1 = load i32, ptr %nt, align 4 - br label %for.body.i - -for.body.i: - %phi = phi 
i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ] - %mul.i163 = mul nsw i32 %phi, %l1 - %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0 - store i32 0, ptr %cv, align 8 - %inc = add nuw nsw i32 %phi, 1 - %l2 = load i32, ptr %cpi, align 8 - %cmp.i164 = icmp slt i32 %inc, %l2 - br i1 %cmp.i164, label %for.body.i, label %exit - -exit: - ret void -} diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll deleted file mode 100644 --- a/polly/test/GPGPU/host-control-flow.ll +++ /dev/null @@ -1,176 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \ -; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \ -; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | FileCheck %s -check-prefix=IR -; void foo(float A[2][100]) { -; for (long t = 0; t < 100; t++) -; for (long i = 1; i < 99; i++) -; A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1]; -; } - -; REQUIRES: pollyacc - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1) -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(4); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, c0); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: } - -; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader -; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] -; ... 
-; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 -; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* -; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] -; IR: call i8* @polly_getKernel -; ... -; IR: call void @polly_freeKernel -; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 -; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99 -; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0) -; KERNEL-IR-LABEL: entry: -; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-IR-NEXT: %t0 = zext i32 %1 to i64 -; KERNEL-IR-NEXT: br label %polly.cond - -; KERNEL-IR-LABEL: polly.cond: ; preds = %entry -; KERNEL-IR-NEXT: %2 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %3 = add nsw i64 %2, %t0 -; KERNEL-IR-NEXT: %4 = icmp sle i64 %3, 97 -; KERNEL-IR-NEXT: br i1 %4, label %polly.then, label %polly.else - -; KERNEL-IR-LABEL: polly.merge: ; preds = %polly.else, %polly.stmt.for.body3 -; KERNEL-IR-NEXT: ret void - -; KERNEL-IR-LABEL: polly.then: ; preds = %polly.cond -; KERNEL-IR-NEXT: %5 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %6 = add nsw i64 %5, %t0 -; KERNEL-IR-NEXT: br label %polly.stmt.for.body3 - -; KERNEL-IR-LABEL: polly.stmt.for.body3: ; preds = %polly.then -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %pexp.pdiv_r = urem i64 %c0, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100 -; KERNEL-IR-NEXT: %7 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %8 = add nsw i64 %7, %t0 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8 -; 
KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A -; KERNEL-IR-NEXT: %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4 -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %pexp.pdiv_r2 = urem i64 %c0, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100 -; KERNEL-IR-NEXT: %9 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %10 = add nsw i64 %9, %t0 -; KERNEL-IR-NEXT: %11 = add nsw i64 %10, 1 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11 -; KERNEL-IR-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4 -; KERNEL-IR-NEXT: %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4 -; KERNEL-IR-NEXT: %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_ -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %pexp.pdiv_r7 = urem i64 %c0, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100 -; KERNEL-IR-NEXT: %12 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %13 = add nsw i64 %12, %t0 -; KERNEL-IR-NEXT: %14 = add nsw i64 %13, 2 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14 -; KERNEL-IR-NEXT: %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9 -; KERNEL-IR-NEXT: %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4 -; KERNEL-IR-NEXT: %p_add12 = fadd float %p_add, %tmp3_p_scalar_ -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %15 = add nsw i64 %c0, 1 -; KERNEL-IR-NEXT: 
%pexp.pdiv_r12 = urem i64 %15, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100 -; KERNEL-IR-NEXT: %16 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %17 = add nsw i64 %16, %t0 -; KERNEL-IR-NEXT: %18 = add nsw i64 %17, 1 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18 -; KERNEL-IR-NEXT: %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14 -; KERNEL-IR-NEXT: %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4 -; KERNEL-IR-NEXT: %p_add17 = fadd float %tmp4_p_scalar_, %p_add12 -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %19 = add nsw i64 %c0, 1 -; KERNEL-IR-NEXT: %pexp.pdiv_r17 = urem i64 %19, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100 -; KERNEL-IR-NEXT: %20 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %21 = add nsw i64 %20, %t0 -; KERNEL-IR-NEXT: %22 = add nsw i64 %21, 1 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22 -; KERNEL-IR-NEXT: %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19 -; KERNEL-IR-NEXT: store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4 -; KERNEL-IR-NEXT: br label %polly.merge - -; KERNEL-IR-LABEL: polly.else: ; preds = %polly.cond -; KERNEL-IR-NEXT: br label %polly.merge -; KERNEL-IR-NEXT: } - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo([100 x float]* %A) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc18, %entry - %t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ] - %exitcond1 = icmp ne i64 %t.0, 100 - br i1 %exitcond1, label %for.body, label %for.end20 - -for.body: ; preds = %for.cond - br label %for.cond1 - -for.cond1: ; preds = %for.inc, 
%for.body - %i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ] - %exitcond = icmp ne i64 %i.0, 99 - br i1 %exitcond, label %for.body3, label %for.end - -for.body3: ; preds = %for.cond1 - %sub = add nsw i64 %i.0, -1 - %rem = srem i64 %t.0, 2 - %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub - %tmp = load float, float* %arrayidx4, align 4 - %rem5 = srem i64 %t.0, 2 - %arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0 - %tmp2 = load float, float* %arrayidx7, align 4 - %add = fadd float %tmp, %tmp2 - %add8 = add nuw nsw i64 %i.0, 1 - %rem9 = srem i64 %t.0, 2 - %arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8 - %tmp3 = load float, float* %arrayidx11, align 4 - %add12 = fadd float %add, %tmp3 - %add13 = add nuw nsw i64 %t.0, 1 - %rem14 = srem i64 %add13, 2 - %arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0 - %tmp4 = load float, float* %arrayidx16, align 4 - %add17 = fadd float %tmp4, %add12 - store float %add17, float* %arrayidx16, align 4 - br label %for.inc - -for.inc: ; preds = %for.body3 - %inc = add nuw nsw i64 %i.0, 1 - br label %for.cond1 - -for.end: ; preds = %for.cond1 - br label %for.inc18 - -for.inc18: ; preds = %for.end - %inc19 = add nuw nsw i64 %t.0, 1 - br label %for.cond - -for.end20: ; preds = %for.cond - ret void -} diff --git a/polly/test/GPGPU/host-statement.ll b/polly/test/GPGPU/host-statement.ll deleted file mode 100644 --- a/polly/test/GPGPU/host-statement.ll +++ /dev/null @@ -1,204 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-invariant-load-hoisting=false \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -polly-invariant-load-hoisting=false \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: 
pollyacc - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.lifetime.start(i64, ptr nocapture) #0 - -; This test case tests that we can correctly handle a ScopStmt that is -; scheduled on the host, instead of within a kernel. - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(16); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: if (p_0 <= 510 && p_1 <= 510) { -; CODE-NEXT: { -; CODE-NEXT: dim3 k1_dimBlock(32); -; CODE-NEXT: dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16); -; CODE-NEXT: kernel1 <<>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k2_dimBlock(16, 32); -; CODE-NEXT: dim3 k2_dimGrid(16, p_1 <= -7650 ? 
256 : -p_1 + floord(31 * p_1 + 30, 32) + 16); -; CODE-NEXT: kernel2 <<>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: } -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost)); -; CODE-NEXT: Stmt_for_cond33_preheader_last(); - -; CODE: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_for_body16(32 * b0 + t0); - -; CODE: # kernel1 -; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1) -; CODE-NEXT: for (int c1 = 0; c1 <= 15; c1 += 1) { -; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0) -; CODE-NEXT: Stmt_for_body35(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510) -; CODE-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1) -; CODE-NEXT: Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3); -; CODE-NEXT: sync0(); -; CODE-NEXT: } - -; CODE: # kernel2 -; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1) -; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510) -; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) -; CODE-NEXT: Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3); - -; KERNEL-IR: call void @llvm.nvvm.barrier0() - -; Function Attrs: nounwind uwtable -define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, ptr %A, ptr %R, ptr %Q) #1 { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry.split, %for.inc86 - %indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ] - %indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ] - br label %for.inc - 
-for.inc: ; preds = %for.cond1.preheader, %for.inc - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ] - %nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ] - %arrayidx5 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24 - %tmp = load double, ptr %arrayidx5, align 8, !tbaa !1 - %arrayidx9 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24 - %tmp27 = load double, ptr %arrayidx9, align 8, !tbaa !1 - %mul = fmul double %tmp, %tmp27 - %add = fadd double %nrm.02, %mul - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 512 - br i1 %exitcond, label %for.inc, label %for.end - -for.end: ; preds = %for.inc - %add.lcssa = phi double [ %add, %for.inc ] - %call = tail call double @sqrt(double %add.lcssa) #2 - %arrayidx13 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24 - store double %call, ptr %arrayidx13, align 8, !tbaa !1 - br label %for.body16 - -for.cond33.preheader: ; preds = %for.body16 - %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1 - %cmp347 = icmp slt i64 %indvars.iv.next25, 512 - br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86 - -for.body35.lr.ph: ; preds = %for.cond33.preheader - br label %for.body35 - -for.body16: ; preds = %for.end, %for.body16 - %indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ] - %arrayidx20 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv10, i64 %indvars.iv24 - %tmp28 = load double, ptr %arrayidx20, align 8, !tbaa !1 - %arrayidx24 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24 - %tmp29 = load double, ptr %arrayidx24, align 8, !tbaa !1 - %div = fdiv double %tmp28, %tmp29 - %arrayidx28 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv10, i64 %indvars.iv24 - store double %div, ptr %arrayidx28, align 8, !tbaa !1 - %indvars.iv.next11 
= add nuw nsw i64 %indvars.iv10, 1 - %exitcond12 = icmp ne i64 %indvars.iv.next11, 512 - br i1 %exitcond12, label %for.body16, label %for.cond33.preheader - -for.cond33.loopexit: ; preds = %for.body62 - %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next22 to i32 - %exitcond23 = icmp ne i32 %lftr.wideiv, 512 - br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge - -for.body35: ; preds = %for.body35.lr.ph, %for.cond33.loopexit - %indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ] - %arrayidx39 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 - store double 0.000000e+00, ptr %arrayidx39, align 8, !tbaa !1 - br label %for.body42 - -for.cond60.preheader: ; preds = %for.body42 - br label %for.body62 - -for.body42: ; preds = %for.body35, %for.body42 - %indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ] - %arrayidx46 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv13, i64 %indvars.iv24 - %tmp30 = load double, ptr %arrayidx46, align 8, !tbaa !1 - %arrayidx50 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv13, i64 %indvars.iv21 - %tmp31 = load double, ptr %arrayidx50, align 8, !tbaa !1 - %mul51 = fmul double %tmp30, %tmp31 - %arrayidx55 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 - %tmp32 = load double, ptr %arrayidx55, align 8, !tbaa !1 - %add56 = fadd double %tmp32, %mul51 - store double %add56, ptr %arrayidx55, align 8, !tbaa !1 - %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 - %exitcond15 = icmp ne i64 %indvars.iv.next14, 512 - br i1 %exitcond15, label %for.body42, label %for.cond60.preheader - -for.body62: ; preds = %for.cond60.preheader, %for.body62 - %indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ] - %arrayidx66 = getelementptr inbounds [512 x double], ptr %A, 
i64 %indvars.iv16, i64 %indvars.iv21 - %tmp33 = load double, ptr %arrayidx66, align 8, !tbaa !1 - %arrayidx70 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv16, i64 %indvars.iv24 - %tmp34 = load double, ptr %arrayidx70, align 8, !tbaa !1 - %arrayidx74 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 - %tmp35 = load double, ptr %arrayidx74, align 8, !tbaa !1 - %mul75 = fmul double %tmp34, %tmp35 - %sub = fsub double %tmp33, %mul75 - %arrayidx79 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21 - store double %sub, ptr %arrayidx79, align 8, !tbaa !1 - %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 - %exitcond18 = icmp ne i64 %indvars.iv.next17, 512 - br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit - -for.cond33.for.inc86_crit_edge: ; preds = %for.cond33.loopexit - br label %for.inc86 - -for.inc86: ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond26 = icmp ne i64 %indvars.iv.next25, 512 - br i1 %exitcond26, label %for.cond1.preheader, label %for.end88 - -for.end88: ; preds = %for.inc86 - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, ptr nocapture) #0 - -; Function Attrs: nounwind -declare double @sqrt(double) #2 - -attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } - -!llvm.ident = !{!0} - -!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"} -!1 = !{!2, !2, i64 0} -!2 = !{!"double", !3, i64 0} -!3 = !{!"omnipotent char", 
!4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/ignore-parameter-bounds.ll b/polly/test/GPGPU/ignore-parameter-bounds.ll deleted file mode 100644 --- a/polly/test/GPGPU/ignore-parameter-bounds.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; CODE: Code -; CODE: ==== -; CODE: No code generated - -source_filename = "bugpoint-output-83bcdeb.bc" -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -@__data_radiation_MOD_cobi = external global [168 x double], align 32 - -; Function Attrs: nounwind uwtable -define void @__radiation_rg_MOD_coe_so() #0 { -entry: - %polly.access.kspec.load = load i32, ptr undef, align 4 - %0 = or i1 undef, undef - br label %polly.preload.cond29 - -polly.preload.cond29: ; preds = %entry - br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30 - -polly.preload.merge30: ; preds = %polly.preload.exec31, %polly.preload.cond29 - %polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ] - ret void - -polly.preload.exec31: ; preds = %polly.preload.cond29 - %1 = sext i32 %polly.access.kspec.load to i64 - %2 = mul nsw i64 7, %1 - %3 = add nsw i64 0, %2 - %4 = add nsw i64 %3, 48 - %polly.access.__data_radiation_MOD_cobi = getelementptr double, ptr @__data_radiation_MOD_cobi, i64 %4 - %polly.access.__data_radiation_MOD_cobi.load = load double, ptr %polly.access.__data_radiation_MOD_cobi, align 8 - br label %polly.preload.merge30 -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll b/polly/test/GPGPU/intrinsic-copied-into-kernel.ll deleted file 
mode 100644 --- a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR -; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR - -; Test that we do recognise and codegen a kernel that has intrinsics. - -; REQUIRES: pollyacc - -; Check that we model the kernel as a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end - -; Check that the intrinsic call is present in the kernel IR. -; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_) -; KERNEL-IR: declare float @llvm.sqrt.f32(float) -; KERNEL-IR: declare float @llvm.fabs.f32(float) - - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) - - -; void f(float *A, float *B, int N) { -; for(int i = 0; i < N; i++) { -; float tmp0 = A[i]; -; float tmp1 = sqrt(tmp1); -; float tmp2 = fabs(tmp2); -; float tmp3 = copysignf(tmp1, tmp2); -; B[i] = tmp4; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(float* %A, float* %B, i32 %N) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp1 = icmp sgt i32 %N, 0 - br i1 %cmp1, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv - %A.arr.i.val = load float, float* %A.arr.i, align 4 - ; Call to intrinsics that should be part of the kernel. 
- %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val) - %fabs = tail call float @llvm.fabs.f32(float %sqrt); - %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs); - %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv - store float %copysign, float* %B.arr.i, align 4 - - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %wide.trip.count = zext i32 %N to i64 - %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - -; Function Attrs: nounwind readnone -declare float @llvm.sqrt.f32(float) #0 -declare float @llvm.fabs.f32(float) #0 -declare float @llvm.copysign.f32(float, float) #0 - -attributes #0 = { nounwind readnone } - diff --git a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll b/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll deleted file mode 100644 --- a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \ -; RUN: -disable-output < %s - -; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually -; fail on an illegal module. 
- -; REQUIRES: pollyacc, asserts -; XFAIL: * -; -; void foo(long A[1024], long B[1024]) { -; for (long i = 0; i < 1024; i++) -; A[i] += (B[i] + (long)&B[i]); -; } - - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb1 - -bb1: ; preds = %bb10, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb12 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0 - %tmp3 = load i64, ptr %tmp, align 8 - %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0 - %tmp5 = ptrtoint ptr %tmp4 to i64 - %tmp6 = add nsw i64 %tmp3, %tmp5 - %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp8 = load i64, ptr %tmp7, align 8 - %tmp9 = add nsw i64 %tmp8, %tmp6 - store i64 %tmp9, ptr %tmp7, align 8 - br label %bb10 - -bb10: ; preds = %bb2 - %tmp11 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb12: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/invalid-kernel.ll b/polly/test/GPGPU/invalid-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/invalid-kernel.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: not FileCheck %s -check-prefix=KERNEL-IR - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; REQUIRES: pollyacc -; -; void foo(long A[1024], long B[1024]) { -; for (long i = 0; i < 1024; i++) -; A[i] += (B[i] + (long)&B[i]); -; } - -; This kernel loads/stores a pointer address we model. This is a rare case, -; were we still lack proper code-generation support. We check here that we -; detect the invalid IR and bail out gracefully. 
- -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_B, dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost)); - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; KERNEL-IR: kernel - -; IR: br i1 false, label %polly.start, label %bb1 - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb1 - -bb1: ; preds = %bb10, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb12 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0 - %tmp3 = load i64, ptr %tmp, align 8 - %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0 - %tmp5 = ptrtoint ptr %tmp4 to i64 - %tmp6 = add nsw i64 %tmp3, %tmp5 - %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp8 = load i64, ptr %tmp7, align 8 - %tmp9 = add nsw i64 %tmp8, %tmp6 - store i64 %tmp9, ptr %tmp7, align 8 - br label %bb10 - -bb10: ; preds = %bb2 - %tmp11 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb12: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/invariant-load-array-access.ll b/polly/test/GPGPU/invariant-load-array-access.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-array-access.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-codegen-ppcg \ -; RUN: 
-polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - - -; REQUIRES: pollyacc - -; Check that we detect a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] }; -; SCOP-NEXT: Execution Context: [tmp] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] }; -; SCOP-NEXT: Execution Context: [tmp] -> { : tmp >= 4 } -; SCOP-NEXT: } - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; This test makes sure that such an access pattern is handled correctly -; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads` -; was the main reason that caused this test case to crash. 
-; -; void f(int *arr, const int *control, const int *readarr) { -; for(int i = 0; i < 1000; i++) { -; int t = 0; -; if (*control > 3) { -; t += *readarr; -; } -; arr[i] = t; -; } -; } - - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" -target triple = "i386-apple-macosx10.12.0" -define void @f(ptr %arr, ptr %control, ptr %readarr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %if.end - %i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ] - %tmp = load i32, ptr %control, align 4 - %cmp1 = icmp sgt i32 %tmp, 3 - br i1 %cmp1, label %if.then, label %if.end - -if.then: ; preds = %for.body - %tmp1 = load i32, ptr %readarr, align 4 - br label %if.end - -if.end: ; preds = %if.then, %for.body - %t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.01 - store i32 %t.0, ptr %arrayidx, align 4 - %inc = add nuw nsw i32 %i.01, 1 - %exitcond = icmp eq i32 %inc, 1000 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %if.end - ret void -} diff --git a/polly/test/GPGPU/invariant-load-escaping-values.ll b/polly/test/GPGPU/invariant-load-escaping-values.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-escaping-values.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: opt %loadPolly -S -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s - -; REQUIRES: pollyacc - -; CHECK: store i64 %polly.access.B.load, ptr %invariant.preload.s2a -; CHECK: %invariant.final_reload = load i64, ptr %invariant.preload.s2a - -; Verify that the final reload of an invariant scalar memory access uses the -; same stack slot that into which the invariant memory access was stored -; originally. Earlier, this was broken as we introduce a new stack slot aside -; of the preload stack slot, which remained uninitialized and caused our escaping -; loads to contain garbage. 
- -define i64 @foo(ptr %A, ptr %B) { -entry: - br label %loop - -loop: - %indvar = phi i64 [0, %entry], [%indvar.next, %loop] - %indvar.next = add nsw i64 %indvar, 1 - %idx = getelementptr float, ptr %A, i64 %indvar - store float 42.0, ptr %idx - %invariant = load i64, ptr %B - %cmp = icmp sle i64 %indvar, 1024 - br i1 %cmp, label %loop, label %exit - -exit: - ret i64 %invariant -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-of-array.ll b/polly/test/GPGPU/invariant-load-hoisting-of-array.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-of-array.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - -; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Entry: Contains (%loaded.ptr.preload.s2a = alloca double*) which is -; | invariant load hoisted `%loaded.ptr` -; v -; Run-time check --(failure branch)--> { old code - contains `%loaded.ptr` } -; | -; (success branch) -; | -; v -; New Code: Should refer to `%loaded.ptr.preload.s2a`, which is -; the invariant load hoisted value, NOT `%loaded.ptr`. - -; In Polly, we preserve the old code and create a separate branch that executes -; the GPU code if a run-time check succeeds. - -; We need to make sure that in the new branch, we pick up invariant load hoisted -; values. The old values will belong to the old code branch. - -; In this case, we use to try to load the 'original' %loaded.ptr in the -; 'New Code' branch,which is wrong. Check that this does not happen. - -; Check that we have a Scop with an invariant load of the array. 
-; SCOP: Function: f -; SCOP-NEXT: Region: %arrload---%for.exit -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: { Stmt_arrload[] -> MemRef_arr_of_ptrs[0] }; - - - -; Check that we have the preloaded array. -; HOST-IR: entry: -; HOST-IR-NEXT: %loaded.ptr.preload.s2a = alloca double* - -; Chek that we store the correct value in the preload. -; polly.preload.begin: ; preds = %polly.split_new_and_old -; HOST-IR: %polly.access.arr.of.ptrs = getelementptr double*, double** %arr.of.ptrs, i64 0 -; HOST-IR-NEXT: %polly.access.arr.of.ptrs.load = load double*, double** %polly.access.arr.of.ptrs -; HOST-IR-NEXT: store double* %polly.access.arr.of.ptrs.load, double** %loaded.ptr.preload.s2a - -; Check that we get back data from the kernel. -; HOST-IR: polly.acc.initialize: ; preds = %polly.start -; HOST-IR: [[FIRSTINDEX:%.+]] = getelementptr double, double* %polly.access.arr.of.ptrs.load, i64 1 -; HOST-IR: [[BITCASTED:%.+]] = bitcast double* [[FIRSTINDEX]] to i8* -; HOST-IR: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_loaded_ptr, i8* [[BITCASTED]], i64 800) - -; Check that the kernel launch is generated in the host IR. -; This declaration would not have been generated unless a kernel launch exists. 
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) - - -; C pseudocode equivalent -; void f(double **arr_of_ptrs) { -; double *loaded_ptr = arr_of_ptrs[0]; -; if (false) { return; } -; else { -; for(int i = 1; i < 100; i++) { -; loaded_ptr[i] = 42.0; -; } -; } -; } - - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - - -; Function Attrs: nounwind uwtable -define void @f(double **%arr.of.ptrs) #0 { -entry: - br label %arrload - -arrload: ; preds = %"7" - %loaded.ptr = load double*, double** %arr.of.ptrs, align 8 - br i1 false, label %"for.exit", label %"for.preheader" - -"for.preheader": ; preds = %"51" - br label %"for.body" - -"for.body": ; preds = %"53", %"53.lr.ph" - %indvar = phi i64 [ 1, %"for.preheader" ], [ %indvar.next, %"for.body" ] - %slot = getelementptr double, double* %loaded.ptr, i64 %indvar - store double 42.0, double* %slot, align 8 - - %indvar.next = add nuw nsw i64 %indvar, 1 - - %check = icmp sgt i64 %indvar.next, 100 - br i1 %check, label %"for.exit", label %"for.body" - -"for.exit": ; preds = %"52.54_crit_edge", %"51" - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll b/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=HOST-IR %s - -; RUN: opt %loadPolly -disable-output -polly-acc-dump-kernel-ir \ -; RUN: -polly-codegen-ppcg -polly-scops \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; Verify that invariant loads used in a kernel statement 
are correctly forwarded -; as subtree value to the GPU kernel. - -; HOST-IR: store float %polly.access.p.load, ptr %invariant.preload.s2a, align 4 - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2({{.*}}ptr addrspace(1) %MemRef_indvar2f__phi{{.*}}) -; KERNEL-IR: %indvar2f.phiops.reload = load float, ptr %indvar2f.phiops, align 4 -; KERNEL-IR: store float %indvar2f.phiops.reload, ptr addrspace(1) %polly.access.MemRef_A, align 4 - -; FIXME: store float %indvar2f.phiops.reload, ptr %indvar2f.phiops, align 4 -; For some reason the above instruction is emitted that stores back to the addess it was just loaded from. - -define void @foo(ptr %A, ptr %p) { -entry: - br label %loop - -loop: - %indvar = phi i64 [0, %entry], [%indvar.next, %loop] - %indvar.next = add i64 %indvar, 1 - %invariant = load float, ptr %p - %ptr = getelementptr float, ptr %A, i64 %indvar - store float 42.0, ptr %ptr - %cmp = icmp sle i64 %indvar, 1024 - br i1 %cmp, label %loop, label %anotherloop - -anotherloop: - %indvar2 = phi i64 [0, %loop], [%indvar2.next, %anotherloop] - %indvar2f = phi float [%invariant, %loop], [%indvar2f, %anotherloop] - %indvar2.next = add i64 %indvar2, 1 - store float %indvar2f, ptr %A - %cmp2 = icmp sle i64 %indvar2, 1024 - br i1 %cmp2, label %anotherloop, label %end - -end: - ret void - -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 
-; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_entry_split[] -> MemRef_begin[0] }; -; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_for_body[i0] -> MemRef_end[0] }; -; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } -; SCOP-NEXT: } - - -; Check that the kernel launch is generated in the host IR. -; This declaration would not have been generated unless a kernel launch exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; void f(int *begin, int *end, int *arr) { -; for (int i = *begin; i < *end; i++) { -; arr[i] = 0; -; } -; } -; - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" - -define void @f(ptr %begin, ptr %end, ptr %arr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp1 = load i32, ptr %begin, align 4 - %tmp41 = load i32, ptr %end, align 4 - %cmp2 = icmp slt i32 %tmp1, %tmp41 - br i1 %cmp2, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %i.03 = phi i32 [ %tmp1, %for.body.lr.ph ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.03 - store i32 0, ptr %arrayidx, align 4 - %inc = add nsw i32 %i.03, 1 - %tmp4 = load i32, ptr %end, align 4 - %cmp = icmp slt i32 %inc, %tmp4 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll +++ /dev/null @@ 
-1,56 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Check that we detect a scop with invariant accesses. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [beginval] -> { Stmt_entry_split[] -> MemRef_begin[0] }; -; SCOP-NEXT: Execution Context: [beginval] -> { : } -; SCOP-NEXT: } - -; Check that the kernel launch is generated in the host IR. -; This declaration would not have been generated unless a kernel launch exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; -; void f(int *begin, int *arr) { -; for (int i = *begin; i < 100; i++) { -; arr[i] = 0; -; } -; } - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" - -define void @f(ptr %begin, ptr %arr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %beginval = load i32, ptr %begin, align 4 - %cmp1 = icmp slt i32 %beginval, 100 - br i1 %cmp1, label %for.body, label %for.end - - - -for.body: ; preds = %for.body.lr.ph, %for.body - %ival = phi i32 [ %beginval, %entry.split ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %ival - store i32 0, ptr %arrayidx, align 4 - %inc = add nsw i32 %ival, 1 - %cmp = icmp slt i32 %ival, 99 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll 
b/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Check that we detect a scop with invariant accesses. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp2] -> { Stmt_for_body[i0] -> MemRef_idx[0] }; -; SCOP-NEXT: Execution Context: [tmp2] -> { : } -; SCOP-NEXT: } - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; Check if we generate GPU code for simple loop with variable upper bound. -; This always worked, but have this test to prevent regressions. 
-; void f(int *idx, int *arr) { -; for (int i = 0; i < *idx; i++) { -; arr[i] = 0; -; } -; } -; -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %idx, ptr %arr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp21 = load i32, ptr %idx, align 4 - %cmp2 = icmp sgt i32 %tmp21, 0 - br i1 %cmp2, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv - store i32 0, ptr %arrayidx, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %tmp2 = load i32, ptr %idx, align 4 - %0 = sext i32 %tmp2 to i64 - %cmp = icmp slt i64 %indvars.iv.next, %0 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-hoisting.ll b/polly/test/GPGPU/invariant-load-hoisting.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting.ll +++ /dev/null @@ -1,116 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; -; RUN: opt %loadPolly -polly-scops -S -polly-invariant-load-hoisting \ -; RUN: -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR -; -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=KERNEL-IR -; -; REQUIRES: pollyacc -; -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end26 -; SCOP-NEXT: Max Loop Depth: 3 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [n, tmp12] -> { 
Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] }; -; SCOP-NEXT: Execution Context: [n, tmp12] -> { : n > 0 } -; SCOP-NEXT: } -; HOST-IR: call void @polly_launchKernel(ptr %[[REGC:[0-9]+]], i32 %{{[0-9]+}}, i32 1, i32 32, i32 1, i32 1, ptr %polly_launch_0_params_i8ptr) -; HOST-IR-NEXT: call void @polly_freeKernel(ptr %[[REGC]]) - -; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_B, ptr addrspace(1) %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge) - - -; Check that we generate correct GPU code in case of invariant load hoisting. -; -; -; static const int N = 3000; -; -; void f(int A[N][N], int *invariant, int B[N][N], int n) { -; for (int i = 0; i < n; i++) { -; for (int j = 0; j < n; j++) { -; for (int k = 0; k < n; k++) { -; -; A[*invariant][k] = B[k][k]; -; A[k][*invariant] += B[k][k]; -; } -; } -; } -; } -; - -define void @f(ptr %A, ptr %invariant, ptr %B, i32 %n) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp6 = icmp sgt i32 %n, 0 - br i1 %cmp6, label %for.cond1.preheader.lr.ph, label %for.end26 - -for.cond1.preheader.lr.ph: ; preds = %entry.split - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc24 - %i.07 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc25, %for.inc24 ] - %cmp23 = icmp sgt i32 %n, 0 - br i1 %cmp23, label %for.cond4.preheader.lr.ph, label %for.inc24 - -for.cond4.preheader.lr.ph: ; preds = %for.cond1.preheader - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %for.cond4.preheader.lr.ph, %for.inc21 - %j.04 = phi i32 [ 0, %for.cond4.preheader.lr.ph ], [ %inc22, %for.inc21 ] - %cmp51 = icmp sgt i32 %n, 0 - br i1 %cmp51, label %for.body6.lr.ph, label %for.inc21 - -for.body6.lr.ph: ; preds = %for.cond4.preheader - br label %for.body6 - -for.body6: ; preds = %for.body6.lr.ph, %for.body6 - %k.02 = phi i32 [ 0, %for.body6.lr.ph ], [ %inc, %for.body6 ] - %idxprom = sext i32 %k.02 to i64 - %idxprom7 = sext 
i32 %k.02 to i64 - %arrayidx8 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom, i64 %idxprom7 - %tmp9 = load i32, ptr %arrayidx8, align 4 - %tmp12 = load i32, ptr %invariant, align 4 - %idxprom9 = sext i32 %tmp12 to i64 - %idxprom11 = sext i32 %k.02 to i64 - %arrayidx12 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom9, i64 %idxprom11 - store i32 %tmp9, ptr %arrayidx12, align 4 - %idxprom13 = sext i32 %k.02 to i64 - %idxprom15 = sext i32 %k.02 to i64 - %arrayidx16 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom13, i64 %idxprom15 - %tmp17 = load i32, ptr %arrayidx16, align 4 - %idxprom17 = sext i32 %k.02 to i64 - %tmp21 = load i32, ptr %invariant, align 4 - %idxprom19 = sext i32 %tmp21 to i64 - %arrayidx20 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom17, i64 %idxprom19 - %tmp22 = load i32, ptr %arrayidx20, align 4 - %add = add nsw i32 %tmp22, %tmp17 - store i32 %add, ptr %arrayidx20, align 4 - %inc = add nuw nsw i32 %k.02, 1 - %cmp5 = icmp slt i32 %inc, %n - br i1 %cmp5, label %for.body6, label %for.cond4.for.inc21_crit_edge - -for.cond4.for.inc21_crit_edge: ; preds = %for.body6 - br label %for.inc21 - -for.inc21: ; preds = %for.cond4.for.inc21_crit_edge, %for.cond4.preheader - %inc22 = add nuw nsw i32 %j.04, 1 - %cmp2 = icmp slt i32 %inc22, %n - br i1 %cmp2, label %for.cond4.preheader, label %for.cond1.for.inc24_crit_edge - -for.cond1.for.inc24_crit_edge: ; preds = %for.inc21 - br label %for.inc24 - -for.inc24: ; preds = %for.cond1.for.inc24_crit_edge, %for.cond1.preheader - %inc25 = add nuw nsw i32 %i.07, 1 - %cmp = icmp slt i32 %inc25, %n - br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end26_crit_edge - -for.cond.for.end26_crit_edge: ; preds = %for.inc24 - br label %for.end26 - -for.end26: ; preds = %for.cond.for.end26_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-of-scalar.ll b/polly/test/GPGPU/invariant-load-of-scalar.ll deleted file mode 100644 --- 
a/polly/test/GPGPU/invariant-load-of-scalar.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck -check-prefix=SCOP %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=HOST-IR %s - - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; Check that we offload invariant loads of scalars correctly. - -; Check that invariant loads are present. -; SCOP: Function: checkPrivatization -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp2] -> { Stmt_entry_split[] -> MemRef_begin[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp2] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp2] -> { Stmt_for_body[i0] -> MemRef_end[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp2] -> { : } -; SCOP-NEXT: } -; - -; Check that we do not actually allocate arrays for %begin, %end, since they are -; invariant load hoisted. -; HOST-IR: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice -; HOST-IR-NOT: call ptr @polly_allocateMemoryForDevice - -; Check that we send the invariant loaded scalars as parameters to the -; kernel function. 
-; KERNEL-IR: define ptx_kernel void @FUNC_checkPrivatization_SCOP_0_KERNEL_0 -; KERNEL-IR-SAME: (ptr addrspace(1) %MemRef_A, i32 %tmp, -; KERNEL-IR-SAME: i32 %tmp2, i32 %polly.access.begin.load) - - -; void checkScalarPointerOffload(int A[], int *begin, int *end) { -; for(int i = *begin; i < *end; i++) { -; A[i] = 10; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -define void @checkPrivatization(ptr %A, ptr %begin, ptr %end) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp = load i32, ptr %begin, align 4 - %tmp21 = load i32, ptr %end, align 4 - %cmp3 = icmp slt i32 %tmp, %tmp21 - br i1 %cmp3, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - %tmp1 = sext i32 %tmp to i64 - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv4 = phi i64 [ %tmp1, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv4 - store i32 10, ptr %arrayidx, align 4 - %indvars.iv.next = add i64 %indvars.iv4, 1 - %tmp2 = load i32, ptr %end, align 4 - %tmp3 = sext i32 %tmp2 to i64 - %cmp = icmp slt i64 %indvars.iv.next, %tmp3 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll deleted file mode 100644 --- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll +++ /dev/null @@ -1,106 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=IR %s - -; REQUIRES: pollyacc -; 
-; void kernel_params_only_some_arrays(float A[], float B[]) { -; for (long i = 0; i < 32; i++) -; A[i] += 42; -; -; for (long i = 0; i < 32; i++) -; B[i] += 42; -; } - -; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0' -; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0" -; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" - -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B) -; KERNEL-NEXT: entry: -; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-NEXT: %t0 = zext i32 %1 to i64 - -; KERNEL: ret void -; KERNEL-NEXT: } - -; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1' -; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1" -; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" - -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A) -; KERNEL-NEXT: entry: -; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-NEXT: %t0 = zext i32 %1 to i64 - -; KERNEL: ret void -; KERNEL-NEXT: } - - -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 -; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 
-; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* -; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] - -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 -; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 -; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* -; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @kernel_params_only_some_arrays(float* %A, float* %B) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %arrayidx = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp = load float, float* %arrayidx, align 4 - %add = fadd float %tmp, 4.200000e+01 - store float %add, float* %arrayidx, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %inc = add nuw nsw i64 %i.0, 1 - br label %for.cond - -for.end: ; preds = %for.cond - br label %for.cond2 - -for.cond2: ; preds = %for.inc7, %for.end - %i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ] - %exitcond = icmp ne i64 %i1.0, 32 - br i1 %exitcond, label %for.body4, label %for.end9 - -for.body4: ; preds = %for.cond2 - %arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0 - %tmp2 = load float, float* %arrayidx5, align 4 - %add6 = fadd float %tmp2, 4.200000e+01 - store float %add6, float* %arrayidx5, align 4 - br label %for.inc7 - -for.inc7: ; preds = %for.body4 - %inc8 = add nuw nsw i64 %i1.0, 1 - br label %for.cond2 - -for.end9: ; preds = %for.cond2 - ret void -} diff --git a/polly/test/GPGPU/kernel-params-scop-parameter.ll b/polly/test/GPGPU/kernel-params-scop-parameter.ll deleted file mode 100644 --- a/polly/test/GPGPU/kernel-params-scop-parameter.ll +++ 
/dev/null @@ -1,38 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; void kernel_params_scop_parameter(float A[], long n) { -; for (long i = 0; i < n; i++) -; A[i] += 42; -; } - -; KERNEL-IR: define ptx_kernel void @FUNC_kernel_params_scop_parameter_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @kernel_params_scop_parameter(ptr %A, i64 %n) { -bb: - br label %bb1 - -bb1: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb8 - -bb2: ; preds = %bb1 - %tmp3 = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp4 = load float, ptr %tmp3, align 4 - %tmp5 = fadd float %tmp4, 4.200000e+01 - store float %tmp5, ptr %tmp3, align 4 - br label %bb6 - -bb6: ; preds = %bb2 - %tmp7 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb8: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll b/polly/test/GPGPU/kernels-names-across-scops-funcs.ll deleted file mode 100644 --- a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll +++ /dev/null @@ -1,124 +0,0 @@ -; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-kernel-ir -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { -; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_1_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { -; KERNEL: define ptx_kernel void @FUNC_foo2_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; Function Attrs: nounwind uwtable -define void @foo(i32 %arg, ptr %arg1) #0 { -bb: - br label %bb2 - -bb2: ; preds = %bb - 
%tmp = icmp sgt i32 %arg, 0 - br i1 %tmp, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb4, %bb3 - %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ] - %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp5 - %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2 - %tmp8 = add nsw i32 %tmp7, 1 - store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2 - %tmp9 = add nuw nsw i64 %tmp5, 1 - %tmp10 = zext i32 %arg to i64 - %tmp11 = icmp ne i64 %tmp9, %tmp10 - br i1 %tmp11, label %bb4, label %bb12 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12, %bb2 - %tmp14 = tail call i64 @clock() #3 - %tmp15 = icmp eq i64 %tmp14, 0 - br i1 %tmp15, label %bb16, label %bb29 - -bb16: ; preds = %bb13 - %tmp17 = icmp sgt i32 %arg, 0 - br i1 %tmp17, label %bb18, label %bb28 - -bb18: ; preds = %bb16 - br label %bb19 - -bb19: ; preds = %bb19, %bb18 - %tmp20 = phi i64 [ 0, %bb18 ], [ %tmp24, %bb19 ] - %tmp21 = getelementptr inbounds i32, ptr %arg1, i64 %tmp20 - %tmp22 = load i32, ptr %tmp21, align 4, !tbaa !2 - %tmp23 = add nsw i32 %tmp22, 1 - store i32 %tmp23, ptr %tmp21, align 4, !tbaa !2 - %tmp24 = add nuw nsw i64 %tmp20, 1 - %tmp25 = zext i32 %arg to i64 - %tmp26 = icmp ne i64 %tmp24, %tmp25 - br i1 %tmp26, label %bb19, label %bb27 - -bb27: ; preds = %bb19 - br label %bb28 - -bb28: ; preds = %bb27, %bb16 - br label %bb29 - -bb29: ; preds = %bb28, %bb13 - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1 - -; Function Attrs: nounwind -declare i64 @clock() #2 - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1 - -; Function Attrs: nounwind uwtable -define void @foo2(i32 %arg, ptr %arg1) #0 { -bb: - br label %bb2 - -bb2: ; preds = %bb - %tmp = icmp sgt i32 %arg, 0 - br i1 %tmp, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb4, %bb3 - %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ] - %tmp6 = getelementptr inbounds 
i32, ptr %arg1, i64 %tmp5 - %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2 - %tmp8 = add nsw i32 %tmp7, 1 - store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2 - %tmp9 = add nuw nsw i64 %tmp5, 1 - %tmp10 = zext i32 %arg to i64 - %tmp11 = icmp ne i64 %tmp9, %tmp10 - br i1 %tmp11, label %bb4, label %bb12 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12, %bb2 - ret void -} - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } - -!llvm.module.flags = !{!0} -!llvm.ident = !{!1} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{!"clang version 5.0.0"} -!2 = !{!3, !3, i64 0} -!3 = !{!"int", !4, i64 0} -!4 = !{!"omnipotent char", !5, i64 0} -!5 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll b/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll +++ /dev/null @@ -1,89 +0,0 @@ -; RUN: opt %loadPolly -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -polly-print-scops -disable-output < %s | FileCheck %s 
--check-prefix=SCOP -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s \ -; RUN: -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll \ -; RUN: | FileCheck %s --check-prefix=HOST-IR - -; Test that we do recognise and codegen a kernel that has functions that can -; be mapped to NVIDIA's libdevice - -; REQUIRES: pollyacc - -; Check that we model the kernel as a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end - -; Check that the intrinsic call is present in the kernel IR. -; KERNEL-IR: %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_) -; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf) -; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf) - -; Powi and exp cannot be lowered directly. Rather, we expect them to be -; lowered by libdevice. -; KERNEL-IR: %p_powi = tail call float @__nv_powif(float %p_logf, i32 2) -; KERNEL-IR: %p_exp = tail call float @__nv_expf(float %p_powi) - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. 
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - - -; void f(float *A, float *B, int N) { -; for(int i = 0; i < N; i++) { -; float tmp0 = A[i]; -; float expf = expf(tmp1); -; cosf = cosf(expf); -; logf = logf(cosf); -; powi = powi(logf, 2); -; exp = exp(powi); -; B[i] = logf; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %A, ptr %B, i32 %N) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp1 = icmp sgt i32 %N, 0 - br i1 %cmp1, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %A.arr.i = getelementptr inbounds float, ptr %A, i64 %indvars.iv - %A.arr.i.val = load float, ptr %A.arr.i, align 4 - ; Call to intrinsics that should be part of the kernel. - %expf = tail call float @expf(float %A.arr.i.val) - %cosf = tail call float @cosf(float %expf) - %logf = tail call float @logf(float %cosf) - %powi = tail call float @llvm.powi.f32.i32(float %logf, i32 2) - %exp = tail call float @llvm.exp.f32(float %powi) - %B.arr.i = getelementptr inbounds float, ptr %B, i64 %indvars.iv - store float %exp, ptr %B.arr.i, align 4 - - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %wide.trip.count = zext i32 %N to i64 - %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - -; Function Attrs: nounwind readnone -declare float @expf(float) #0 -declare float @cosf(float) #0 -declare float @logf(float) #0 -declare float @llvm.powi.f32.i32(float, i32) #0 -declare float @llvm.exp.f32(float) #0 - -attributes #0 = { nounwind readnone } - diff --git 
a/polly/test/GPGPU/live-range-reordering-with-privatization.ll b/polly/test/GPGPU/live-range-reordering-with-privatization.ll deleted file mode 100644 --- a/polly/test/GPGPU/live-range-reordering-with-privatization.ll +++ /dev/null @@ -1,78 +0,0 @@ - ; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ -; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-code -disable-output \ -; RUN: < %s | FileCheck %s -check-prefix=CODE - -; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ -; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-kernel-ir -disable-output \ -; RUN: < %s | FileCheck %s -check-prefix=KERNELIR - -; REQUIRES: pollyacc - -; void f(const int *end, int *arr, const int *control, const int *readarr) { -; for (int i = 0; i < *end; i++) { -; int t = 0; -; if (*control > 3) { -; t += readarr[i]; -; } -; arr[i] = t; -; } -; } - -; This test case tests the ability to infer that `t` is local to each loop -; iteration, and can therefore be privatized. 
- -; CODE: # kernel0 -; CODE-NEXT: for (int c0 = 0; c0 <= (tmp - 32 * b0 - 1) / 1048576; c0 += 1) -; CODE-NEXT: if (tmp >= 32 * b0 + t0 + 1048576 * c0 + 1) { -; CODE-NEXT: Stmt_for_body_last(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: if (tmp1 >= 4) -; CODE-NEXT: Stmt_if_then(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: Stmt_if_end(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: } - -; KERNELIR: %private_array = alloca i32 - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" -target triple = "i386-apple-macosx10.12.0" - -define void @f(ptr %end, ptr %arr, ptr %control, ptr %readarr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp3 = load i32, ptr %end, align 4 - %cmp4 = icmp sgt i32 %tmp3, 0 - br i1 %cmp4, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %if.end - %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ] - %tmp1 = load i32, ptr %control, align 4 - %cmp1 = icmp sgt i32 %tmp1, 3 - br i1 %cmp1, label %if.then, label %if.end - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, ptr %readarr, i32 %i.05 - %tmp2 = load i32, ptr %arrayidx, align 4 - br label %if.end - -if.end: ; preds = %if.then, %for.body - %t.0 = phi i32 [ %tmp2, %if.then ], [ 0, %for.body ] - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i32 %i.05 - store i32 %t.0, ptr %arrayidx2, align 4 - %inc = add nuw nsw i32 %i.05, 1 - %tmp = load i32, ptr %end, align 4 - %cmp = icmp slt i32 %inc, %tmp - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %if.end - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - diff --git a/polly/test/GPGPU/loops-outside-scop.ll b/polly/test/GPGPU/loops-outside-scop.ll deleted file mode 100644 --- a/polly/test/GPGPU/loops-outside-scop.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops 
-disable-output < %s | FileCheck %s -check-prefix=SCOP - -; There is no FileCheck because we want to make sure that this doesn't crash. -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \ -; RUN: -disable-output < %s - -; REQUIRES: pollyacc - -; Due to the existence of the `fence` call, We can only detect the inner loop -; and not the outer loop. PPCGCodeGeneration had not implemented this case. -; The fix was to pull the implementation from `IslNodeBuilder. - -; Make sure that we only capture the inner loop -; SCOP: Function: f -; SCOP-NEXT: Region: %for2.body---%for2.body.fence -; SCOP-NEXT: Max Loop Depth: 1 - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -declare void @fn_to_fence(ptr %val) - -; void f(int *arr, bool shouldcont) { -; for(int i = 0; ; i++) { -; for(int j = 0; j < 10; j++) { -; arr[j] = i; -; } -; fence(arr); -; if (!shouldcont) break; -; } -; } - - -; Function Attrs: nounwind uwtable -define void @f(ptr %arr, i1 %shouldcont) #1 { -entry: - br label %for.init - -for.init: ; preds = %for.end, %entry.split - %i = phi i32 [ %i.next, %for.end ], [ 0, %entry ] - br label %for2.body - -for2.body: ; preds = %"65", %"64" - %j = phi i32 [ %j.next, %for2.body ], [ 0, %for.init ] - %j.sext = sext i32 %j to i64 - %arr.slot = getelementptr i32, ptr %arr, i64 %j.sext - store i32 %i, ptr %arr.slot, align 4 - %exitcond = icmp eq i32 %j, 10 - %j.next = add i32 %j, 1 - br i1 %exitcond, label %for2.body.fence, label %for2.body - -for2.body.fence: ; preds = %"65" - call void @fn_to_fence(ptr %arr) #2 - br i1 %shouldcont, label %for.end, label %exit -for.end: ; preds = %"69" - %i.next = add i32 %i, 1 - br label %for.init - -exit: ; preds = %"69" - ret void - -} - - -attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind uwtable } 
-attributes #2 = { nounwind } diff --git a/polly/test/GPGPU/managed-memory-rewrite-alloca.ll b/polly/test/GPGPU/managed-memory-rewrite-alloca.ll deleted file mode 100644 --- a/polly/test/GPGPU/managed-memory-rewrite-alloca.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ -; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory -polly-acc-rewrite-allocas < %s | FileCheck %s --check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP: i32 MemRef_arr[*]; - -; Check that we generate a constructor call for @A.toptr -; HOST-IR-NOT: %arr = alloca [100 x i32] - -source_filename = "test.c" -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - - -define void @f() { -entry: - %arr = alloca [100 x i32] - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds [100 x i32], ptr %arr, i64 0, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 
6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll deleted file mode 100644 --- a/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll +++ /dev/null @@ -1,93 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -S -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR -; -; REQUIRES: pollyacc -; -; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and -; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory` -; pass, even inside `constantExpr`. This is necessary because a cookie cutter -; Inst->replaceUsesOfWith(...) call does not actually work, because this does -; not replace the instruction within a ConstantExpr. -; -; #include -; -; static const int N = 100; -; int* f(int *ToFree) { -; free(ToFree); -; int *A = (int *)malloc(sizeof(int) * N); -; for(int i = 0; i < N; i++) { -; A[i] = 42; -; } -; return A; -; -; } - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; SCOP: Arrays { -; SCOP-NEXT: i32 MemRef_tmp[*]; // Element size 4 -; SCOP-NEXT: } - -; // Check that polly_mallocManaged is declared and used correctly. -; HOST-IR: declare ptr @polly_mallocManaged(i64) - -; // Check that polly_freeManaged is declared and used correctly. 
-; HOST-IR call void @polly_freeManaged(i8* %toFree) -; HOST-IR: declare void @polly_freeManaged(ptr) - -; // Check that we remove the original malloc,free -; HOST-IR-NOT: declare ptr @malloc(i64) -; HOST-IR-NOT: declare void @free(ptr) - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -define ptr @f(ptr %toFree) { -entry: - ; Free inside bitcast - call void @free (ptr %toFree) - br label %entry.split - -entry.split: ; preds = %entry - ; malloc inside bitcast. - %tmp = call ptr @malloc (i64 400) - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %tmp, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret ptr %tmp -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - -declare ptr @malloc(i64) -declare void @free(ptr) - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll deleted file mode 100644 --- a/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; 
RUN: -S -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR -; -; REQUIRES: pollyacc -; -; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and -; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory` -; pass. -; -; #include -; -; static const int N = 100; -; int* f(int *ToFree) { -; free(ToFree); -; int *A = (int *)malloc(sizeof(int) * N); -; for(int i = 0; i < N; i++) { -; A[i] = 42; -; } -; return A; -; -; } - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; SCOP: Arrays { -; SCOP-NEXT: i32 MemRef_call[*]; // Element size 4 -; SCOP-NEXT: } - -; // Check that polly_mallocManaged is declared and used correctly. -; HOST-IR: %call = tail call ptr @polly_mallocManaged(i64 400) -; HOST-IR: declare ptr @polly_mallocManaged(i64) - -; // Check that polly_freeManaged is declared and used correctly. -; HOST-IR %toFreeBitcast = bitcast i32* %toFree to i8* -; HOST-IR call void @polly_freeManaged(i8* %toFreeBitcast) -; HOST-IR: declare void @polly_freeManaged(ptr) - -; // Check that we remove the original malloc,free -; HOST-IR-NOT: declare ptr @malloc(i64) -; HOST-IR-NOT: declare void @free(ptr) - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -define ptr @f(ptr %toFree) { -entry: - call void @free(ptr %toFree) - br label %entry.split - -entry.split: ; preds = %entry - %call = tail call ptr @malloc(i64 400) - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %call, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret 
ptr %call -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - -declare ptr @malloc(i64) -declare void @free(ptr) - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/memory-only-referenced-from-access.ll b/polly/test/GPGPU/memory-only-referenced-from-access.ll deleted file mode 100644 --- a/polly/test/GPGPU/memory-only-referenced-from-access.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -polly-invariant-load-hoisting -polly-ignore-aliasing \ -; RUN: -polly-process-unprofitable -polly-ignore-parameter-bounds \ -; RUN: -polly-acc-fail-on-verify-module-failure \ -; RUN: -polly-acc-codegen-managed-memory \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc - -; Verify that we correctly generate a kernel even if certain invariant load -; hoisted parameters appear only in memory accesses, but not domain elements. 
- -; CHECK: @FUNC_quux_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_tmp4, i32 %tmp3, i32 %tmp, i32 %tmp31, i32 %tmp2) - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -%struct.hoge = type { ptr, i64, i64, [1 x %struct.widget] } -%struct.widget = type { i64, i64, i64 } - -@global = external unnamed_addr global %struct.hoge, align 32 - -define void @quux(ptr noalias %arg, ptr noalias %arg1) { -bb: - %tmp = load i32, ptr %arg, align 4 - %tmp2 = sext i32 %tmp to i64 - %tmp3 = load i32, ptr %arg1, align 4 - %tmp4 = load ptr, ptr @global, align 32 - br label %bb5 - -bb5: ; preds = %bb5, %bb - %tmp6 = phi i32 [ %tmp11, %bb5 ], [ 0, %bb ] - %tmp7 = sext i32 %tmp6 to i64 - %tmp8 = sub nsw i64 %tmp7, %tmp2 - %tmp9 = getelementptr [0 x double], ptr %tmp4, i64 0, i64 %tmp8 - store double undef, ptr %tmp9, align 8 - %tmp10 = icmp eq i32 %tmp6, %tmp3 - %tmp11 = add i32 %tmp6, 1 - br i1 %tmp10, label %bb12, label %bb5 - -bb12: ; preds = %bb5 - ret void -} diff --git a/polly/test/GPGPU/mostly-sequential.ll b/polly/test/GPGPU/mostly-sequential.ll deleted file mode 100644 --- a/polly/test/GPGPU/mostly-sequential.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; void foo(float A[]) { -; for (long i = 0; i < 128; i++) -; A[i] += i; -; -; for (long i = 0; i < 128; i++) -; for (long j = 0; j < 128; j++) -; A[42] += i + j; -; } - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(4); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 
k1_dimBlock; -; CODE-NEXT: dim3 k1_dimGrid; -; CODE-NEXT: kernel1 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (128) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb4(32 * b0 + t0); - -; CODE: # kernel1 -; CODE-NEXT: for (int c0 = 0; c0 <= 127; c0 += 1) -; CODE-NEXT: for (int c1 = 0; c1 <= 127; c1 += 1) -; CODE-NEXT: Stmt_bb14(c0, c1); - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A) { -bb: - br label %bb3 - -bb3: ; preds = %bb8, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ] - %exitcond2 = icmp ne i64 %i.0, 128 - br i1 %exitcond2, label %bb4, label %bb10 - -bb4: ; preds = %bb3 - %tmp = sitofp i64 %i.0 to float - %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp6 = load float, ptr %tmp5, align 4 - %tmp7 = fadd float %tmp6, %tmp - store float %tmp7, ptr %tmp5, align 4 - br label %bb8 - -bb8: ; preds = %bb4 - %tmp9 = add nuw nsw i64 %i.0, 1 - br label %bb3 - -bb10: ; preds = %bb3 - br label %bb11 - -bb11: ; preds = %bb23, %bb10 - %i1.0 = phi i64 [ 0, %bb10 ], [ %tmp24, %bb23 ] - %exitcond1 = icmp ne i64 %i1.0, 128 - br i1 %exitcond1, label %bb12, label %bb25 - -bb12: ; preds = %bb11 - br label %bb13 - -bb13: ; preds = %bb20, %bb12 - %j.0 = phi i64 [ 0, %bb12 ], [ %tmp21, %bb20 ] - %exitcond = icmp ne i64 %j.0, 128 - br i1 %exitcond, label %bb14, label %bb22 - -bb14: ; preds = %bb13 - %tmp15 = add nuw nsw i64 %i1.0, %j.0 - %tmp16 = sitofp i64 %tmp15 to float - %tmp17 = getelementptr inbounds float, ptr %A, i64 42 - %tmp18 = load float, ptr %tmp17, align 4 - %tmp19 = fadd float %tmp18, %tmp16 - store float %tmp19, ptr %tmp17, align 4 - br label %bb20 - -bb20: ; preds = %bb14 - %tmp21 = add nuw nsw i64 %j.0, 1 - br label %bb13 - -bb22: ; preds = %bb13 - br label %bb23 - -bb23: ; preds = %bb22 - %tmp24 = add nuw nsw i64 
%i1.0, 1 - br label %bb11 - -bb25: ; preds = %bb11 - ret void -} diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll deleted file mode 100644 --- a/polly/test/GPGPU/non-read-only-scalars.ll +++ /dev/null @@ -1,168 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR -; -; REQUIRES: pollyacc -; -; #include -; -; float foo(float A[]) { -; float sum = 0; -; -; for (long i = 0; i < 32; i++) -; A[i] = i; -; -; for (long i = 0; i < 32; i++) -; A[i] += i; -; -; for (long i = 0; i < 32; i++) -; sum += A[i]; -; -; return sum; -; } -; -; int main() { -; float A[32]; -; float sum = foo(A); -; printf("%f\n", sum); -; } - -; CODE: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock; -; CODE-NEXT: dim3 k1_dimGrid; -; CODE-NEXT: kernel1 <<>> (dev_MemRef_sum_0__phi); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k2_dimBlock; -; CODE-NEXT: dim3 k2_dimGrid; -; CODE-NEXT: kernel2 <<>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0__phi)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: { -; CODE-NEXT: Stmt_bb4(t0); -; CODE-NEXT: Stmt_bb10(t0); -; CODE-NEXT: } - -; CODE: # 
kernel1 -; CODE-NEXT: Stmt_bb17(); - -; CODE: # kernel2 -; TODO-NEXT: { -; TODO-NEXT: read(); -; TODO-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) { -; TODO-NEXT: Stmt_bb18(c0); -; TODO-NEXT: if (c0 <= 31) -; TODO-NEXT: Stmt_bb20(c0); -; TODO-NEXT: } -; TODO-NEXT: write(); -; TODO-NEXT: } - - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(ptr addrspace(1) %MemRef_sum_0__phi) -; KERNEL-IR: store float 0.000000e+00, ptr %sum.0.phiops -; KERNEL-IR: [[REGA:%.+]] = addrspacecast ptr addrspace(1) %MemRef_sum_0__phi to ptr -; KERNEL-IR: [[REGB:%.+]] = load float, ptr %sum.0.phiops -; KERNEL-IR: store float [[REGB]], ptr [[REGA]] - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2(ptr addrspace(1) %MemRef_A, ptr addrspace(1) %MemRef_sum_0__phi, ptr addrspace(1) %MemRef_sum_0) - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1 - -define float @foo(ptr %A) { -bb: - br label %bb3 - -bb3: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %exitcond2 = icmp ne i64 %i.0, 32 - br i1 %exitcond2, label %bb4, label %bb8 - -bb4: ; preds = %bb3 - %tmp = sitofp i64 %i.0 to float - %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0 - store float %tmp, ptr %tmp5, align 4 - br label %bb6 - -bb6: ; preds = %bb4 - %tmp7 = add nuw nsw i64 %i.0, 1 - br label %bb3 - -bb8: ; preds = %bb3 - br label %bb9 - -bb9: ; preds = %bb15, %bb8 - %i1.0 = phi i64 [ 0, %bb8 ], [ %tmp16, %bb15 ] - %exitcond1 = icmp ne i64 %i1.0, 32 - br i1 %exitcond1, label %bb10, label %bb17 - -bb10: ; preds = %bb9 - %tmp11 = sitofp i64 %i1.0 to float - %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0 - %tmp13 = load float, ptr %tmp12, align 4 - %tmp14 = fadd float %tmp13, %tmp11 - store float %tmp14, ptr %tmp12, align 4 - br label %bb15 - -bb15: ; preds = %bb10 - %tmp16 = add nuw nsw i64 %i1.0, 1 - br label %bb9 - -bb17: ; preds = %bb9 - br label %bb18 - -bb18: ; preds = %bb20, %bb17 - 
%sum.0 = phi float [ 0.000000e+00, %bb17 ], [ %tmp23, %bb20 ] - %i2.0 = phi i64 [ 0, %bb17 ], [ %tmp24, %bb20 ] - %exitcond = icmp ne i64 %i2.0, 32 - br i1 %exitcond, label %bb19, label %bb25 - -bb19: ; preds = %bb18 - br label %bb20 - -bb20: ; preds = %bb19 - %tmp21 = getelementptr inbounds float, ptr %A, i64 %i2.0 - %tmp22 = load float, ptr %tmp21, align 4 - %tmp23 = fadd float %sum.0, %tmp22 - %tmp24 = add nuw nsw i64 %i2.0, 1 - br label %bb18 - -bb25: ; preds = %bb18 - %sum.0.lcssa = phi float [ %sum.0, %bb18 ] - ret float %sum.0.lcssa -} - -define i32 @main() { -bb: - %A = alloca [32 x float], align 16 - %tmp1 = call float @foo(ptr %A) - %tmp2 = fpext float %tmp1 to double - %tmp3 = call i32 (ptr, ...) @printf(ptr @.str, double %tmp2) #2 - ret i32 0 -} - -declare i32 @printf(ptr, ...) #1 - diff --git a/polly/test/GPGPU/non-zero-array-offset.ll b/polly/test/GPGPU/non-zero-array-offset.ll deleted file mode 100644 --- a/polly/test/GPGPU/non-zero-array-offset.ll +++ /dev/null @@ -1,116 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR -; -; REQUIRES: pollyacc - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice)); - -; CODE: dim3 k0_dimBlock(8); -; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_B); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock(8); -; CODE-NEXT: dim3 k1_dimGrid(1); -; CODE-NEXT: kernel1 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, 
(8) * sizeof(float), cudaMemcpyDeviceToHost)); - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb3(t0); - -; CODE: # kernel1 -; CODE-NEXT: Stmt_bb11(t0); - -; IR: %p_dev_array_MemRef_B = call ptr @polly_allocateMemoryForDevice(i64 32) -; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 32) -; IR-NEXT: [[REG0:%.+]] = getelementptr float, ptr %B, i64 8 -; IR-NEXT: call void @polly_copyFromHostToDevice(ptr [[REG0]], ptr %p_dev_array_MemRef_B, i64 32) - -; IR: [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_B) -; IR-NEXT: [[REGC:%.+]] = getelementptr float, ptr [[REGA]], i64 -8 - -; void foo(float A[], float B[]) { -; for (long i = 0; i < 8; i++) -; B[i + 8] *= 4; -; -; for (long i = 0; i < 8; i++) -; A[i] *= 12; -; } -; -; #ifdef OUTPUT -; int main() { -; float A[16]; -; -; for (long i = 0; i < 16; i++) { -; __sync_synchronize(); -; A[i] = i; -; } -; -; foo(A, A); -; -; float sum = 0; -; for (long i = 0; i < 16; i++) { -; __sync_synchronize(); -; sum += A[i]; -; } -; -; printf("%f\n", sum); -; } -; #endif -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb2 - -bb2: ; preds = %bb7, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ] - %exitcond1 = icmp ne i64 %i.0, 8 - br i1 %exitcond1, label %bb3, label %bb9 - -bb3: ; preds = %bb2 - %tmp = add nuw nsw i64 %i.0, 8 - %tmp4 = getelementptr inbounds float, ptr %B, i64 %tmp - %tmp5 = load float, ptr %tmp4, align 4 - %tmp6 = fmul float %tmp5, 4.000000e+00 - store float %tmp6, ptr %tmp4, align 4 - br label %bb7 - -bb7: ; preds = %bb3 - %tmp8 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb9: ; preds = %bb2 - br label %bb10 - -bb10: ; preds = %bb15, %bb9 - %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ] - %exitcond = icmp ne i64 %i1.0, 8 - br i1 %exitcond, label %bb11, label %bb17 - -bb11: ; preds = %bb10 - %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0 - %tmp13 = load float, ptr %tmp12, align 4 - %tmp14 = fmul 
float %tmp13, 1.200000e+01 - store float %tmp14, ptr %tmp12, align 4 - br label %bb15 - -bb15: ; preds = %bb11 - %tmp16 = add nuw nsw i64 %i1.0, 1 - br label %bb10 - -bb17: ; preds = %bb10 - ret void -} diff --git a/polly/test/GPGPU/only-part-of-array-modified.ll b/polly/test/GPGPU/only-part-of-array-modified.ll deleted file mode 100644 --- a/polly/test/GPGPU/only-part-of-array-modified.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s -; -; REQUIRES: pollyacc -; -; void foo(float A[], float B[]) { -; for (long i = 0; i < 1024; i++) -; A[2 * i] = B[i]; -; } - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i32), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2047) * sizeof(i32), cudaMemcpyHostToDevice)); - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb1 - -bb1: ; preds = %bb8, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb10 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds float, ptr %B, i64 %i.0 - %tmp4 = load i32, ptr %tmp, align 4 - %tmp5 = shl nsw i64 %i.0, 1 - %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5 - store i32 %tmp4, ptr %tmp6, align 4 - br label %bb8 - -bb8: ; preds = %bb2 - %tmp9 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb10: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/parametric-loop-bound.ll b/polly/test/GPGPU/parametric-loop-bound.ll deleted file mode 100644 --- a/polly/test/GPGPU/parametric-loop-bound.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=IR 
%s - -; REQUIRES: pollyacc - -; void foo(long A[], long n) { -; for (long i = 0; i < n; i++) -; A[i] += 100; -; } - -; CODE: if (n >= 1) { -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (n) * sizeof(i64), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(n >= 1048545 ? 32768 : (n + 31) / 32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, n); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (n) * sizeof(i64), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: for (int c0 = 0; c0 <= (n - 32 * b0 - 1) / 1048576; c0 += 1) -; CODE-NEXT: if (n >= 32 * b0 + t0 + 1048576 * c0 + 1) -; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0); - -; IR: store i64 %n, ptr %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1 -; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGA]] - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, i64 %n) { -bb: - br label %bb1 - -bb1: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb8 - -bb2: ; preds = %bb1 - %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp4 = load i64, ptr %tmp3, align 8 - %tmp5 = add nsw i64 %tmp4, 100 - store i64 %tmp5, ptr %tmp3, align 8 - br label %bb6 - -bb6: ; preds = %bb2 - %tmp7 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb8: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/partial_writes.ll b/polly/test/GPGPU/partial_writes.ll deleted file mode 100644 --- a/polly/test/GPGPU/partial_writes.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: opt %loadPolly -polly-import-jscop -polly-codegen-ppcg -polly-stmt-granularity=bb -S < %s \ -; RUN: | FileCheck %s - -; REQUIRES: pollyacc - -target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; CHECK: polly_launchKernel - -; Function Attrs: nounwind uwtable -define void @partial_writes() { -bb: - %tmp = tail call ptr @wibble() #2 - br label %bb2 - -bb2: ; preds = %bb11, %bb - %tmp3 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %tmp4 = getelementptr inbounds [1200 x double], ptr %tmp, i64 0, i64 %tmp3 - %tmp5 = load double, ptr %tmp4, align 8, !tbaa !1 - br label %bb6 - -bb6: ; preds = %bb6, %bb2 - %tmp7 = phi double [ undef, %bb2 ], [ undef, %bb6 ] - %tmp8 = phi i64 [ 0, %bb2 ], [ %tmp9, %bb6 ] - store double undef, ptr %tmp4, align 8, !tbaa !1 - %tmp9 = add nuw nsw i64 %tmp8, 1 - %tmp10 = icmp eq i64 %tmp9, 900 - br i1 %tmp10, label %bb11, label %bb6 - -bb11: ; preds = %bb6 - %tmp12 = add nuw nsw i64 %tmp3, 1 - %tmp13 = icmp eq i64 %tmp12, 1200 - br i1 %tmp13, label %bb14, label %bb2 - -bb14: ; preds = %bb11 - ret void -} - -declare ptr @wibble() - - -!llvm.ident = !{!0} - -!0 = !{!"clang version 6.0.0 (trunk 309912) (llvm/trunk 309933)"} -!1 = !{!2, !2, i64 0} -!2 = !{!"double", !3, i64 0} -!3 = !{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop b/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop deleted file mode 100644 --- a/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop +++ /dev/null @@ -1,47 +0,0 @@ -{ - "arrays" : [ - { - "name" : "MemRef_tmp", - "sizes" : [ "*" ], - "type" : "double" - } - ], - "context" : "{ : }", - "name" : "%bb2---%bb14", - "statements" : [ - { - "accesses" : [ - { - "kind" : "read", - "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }" - }, - { - "kind" : "write", - "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }" - } - ], - "domain" : "{ Stmt_bb2[i0] : 0 <= i0 <= 1199 }", - "name" : "Stmt_bb2", - "schedule" : "{ Stmt_bb2[i0] -> [i0, 0, 0] }" - }, - { - "accesses" : [ - { - "kind" : "write", - "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] : i1 <= 898 }" 
- }, - { - "kind" : "read", - "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }" - }, - { - "kind" : "write", - "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }" - } - ], - "domain" : "{ Stmt_bb6[i0, i1] : 0 <= i0 <= 1199 and 0 <= i1 <= 899 }", - "name" : "Stmt_bb6", - "schedule" : "{ Stmt_bb6[i0, i1] -> [i0, 1, i1] }" - } - ] -} diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/phi-nodes-in-kernel.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; REQUIRES: pollyacc - -; Approximate C source: -; void kernel_dynprog(int c[50]) { -; int iter = 0; -; int outl = 0; -; -; while(1) { -; for(int indvar = 1 ; indvar <= 49; indvar++) { -; c[indvar] = undef; -; } -; add78 = c[49] + outl; -; inc80 = iter + 1; -; -; if (true) break; -; -; outl = add78; -; iter = inc80; -; } -;} -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; CODE: cudaCheckReturn(cudaMalloc((void **) &dev_MemRef_c, (50) * sizeof(i32))); - -; CODE: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(2); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_c); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_c)); - -; CODE: # kernel0 -; CODE-NEXT: if (32 * b0 + t0 <= 48) -; CODE-NEXT: Stmt_for_body17(0, 32 * b0 + t0); - -; IR-LABEL: call void @polly_freeKernel -; IR: 
[[REGC:%.+]] = bitcast i32* %{{[0-9]+}} to i8* -; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196) - -; KERNEL-IR: define ptx_kernel void @FUNC_kernel_dynprog_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_c) #0 { -; KERNEL-IR: %polly.access.MemRef_c = getelementptr i32, i32 addrspace(1)* %polly.access.cast.MemRef_c, i64 %9 -; KERNEL-IR-NEXT: store i32 422, i32 addrspace(1)* %polly.access.MemRef_c, align 4 - -define void @kernel_dynprog([50 x i32]* %c) { -entry: - %arrayidx77 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 49 - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.cond15.for.cond12.loopexit_crit_edge, %entry - %out_l.055 = phi i32 [ 0, %entry ], [ %add78, %for.cond15.for.cond12.loopexit_crit_edge ] - %iter.054 = phi i32 [ 0, %entry ], [ %inc80, %for.cond15.for.cond12.loopexit_crit_edge ] - br label %for.body17 - -for.cond15.for.cond12.loopexit_crit_edge: ; preds = %for.body17 - %tmp = load i32, i32* %arrayidx77, align 4 - %add78 = add nsw i32 %tmp, %out_l.055 - %inc80 = add nuw nsw i32 %iter.054, 1 - br i1 false, label %for.cond1.preheader, label %for.end81 - -for.body17: ; preds = %for.body17, %for.cond1.preheader - %indvars.iv71 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next72, %for.body17 ] - %arrayidx69 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 %indvars.iv71 - store i32 422, i32* %arrayidx69, align 4 - %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1 - %lftr.wideiv74 = trunc i64 %indvars.iv.next72 to i32 - %exitcond75 = icmp ne i32 %lftr.wideiv74, 50 - br i1 %exitcond75, label %for.body17, label %for.cond15.for.cond12.loopexit_crit_edge - -for.end81: ; preds = %for.cond15.for.cond12.loopexit_crit_edge - ret void -} diff --git a/polly/test/GPGPU/private-memory.ll b/polly/test/GPGPU/private-memory.ll deleted file mode 100644 --- a/polly/test/GPGPU/private-memory.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly 
-polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-private \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-acc-use-private \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; void add(float *A) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 10; j++) -; A[i] += 1; -; } - -; CODE: # kernel0 -; CODE: { -; CODE: read(t0); -; CODE: for (int c3 = 0; c3 <= 9; c3 += 1) -; CODE: Stmt_bb5(t0, c3); -; CODE: write(t0); -; CODE: } - -; KERNEL: %private_array = alloca [1 x float] - -; KERNEL: %polly.access.cast.private_array = bitcast [1 x float]* %private_array to float* -; KERNEL-NEXT: %polly.access.private_array = getelementptr float, float* %polly.access.cast.private_array, i64 0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0 -; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_A -; KERNEL-NEXT: store float %shared.read, float* %polly.access.private_array - -; KERNEL: %polly.access.cast.private_array5 = bitcast [1 x float]* %private_array to float* -; KERNEL-NEXT: %polly.access.private_array6 = getelementptr float, float* %polly.access.cast.private_array5, i64 0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A7 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A8 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A7, i64 %t0 -; KERNEL-NEXT: %shared.write = load float, float* %polly.access.private_array6 -; KERNEL-NEXT: store float %shared.write, float addrspace(1)* %polly.access.MemRef_A8 - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @add(float* %A) { -bb: - br label 
%bb2 - -bb2: ; preds = %bb11, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb8, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %j.0, 10 - br i1 %exitcond, label %bb5, label %bb10 - -bb5: ; preds = %bb4 - %tmp = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp6 = load float, float* %tmp, align 4 - %tmp7 = fadd float %tmp6, 1.000000e+00 - store float %tmp7, float* %tmp, align 4 - br label %bb8 - -bb8: ; preds = %bb5 - %tmp9 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb10: ; preds = %bb4 - br label %bb11 - -bb11: ; preds = %bb10 - %tmp12 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb13: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/privatization-simple.ll b/polly/test/GPGPU/privatization-simple.ll deleted file mode 100644 --- a/polly/test/GPGPU/privatization-simple.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. 
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; void f(int A[], int B[], int control, int C[]) { -; int x; -; #pragma scop -; for(int i = 0; i < 1000; i ++) { -; x = 0; -; if(control) x = C[i]; -; B[i] = x * A[i]; -; -; } -; #pragma endscop -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %A, ptr %B, i32 %control, ptr %C) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %if.end - %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] - %tobool = icmp eq i32 %control, 0 - br i1 %tobool, label %if.end, label %if.then - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv - %tmp4 = load i32, ptr %arrayidx, align 4 - br label %if.end - -if.end: ; preds = %for.body, %if.then - %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] - %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv - %tmp8 = load i32, ptr %arrayidx2, align 4 - %mul = mul nsw i32 %tmp8, %x.0 - %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv - store i32 %mul, ptr %arrayidx4, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %for.body, label %for.end - -for.end: ; preds = %if.end - ret void -} diff --git a/polly/test/GPGPU/privatization.ll b/polly/test/GPGPU/privatization.ll deleted file mode 100644 --- a/polly/test/GPGPU/privatization.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: checkPrivatization -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - - -; Check that kernel launch is generated in host IR. 
-; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; -; -; void checkPrivatization(int A[], int B[], int C[], int control) { -; int x; -; #pragma scop -; for (int i = 0; i < 1000; i++) { -; x = 0; -; if (control) -; x += C[i]; -; -; B[i] = x * A[i]; -; } -; #pragma endscop -; } -; -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @checkPrivatization(ptr %A, ptr %B, ptr %C, i32 %control) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %if.end - %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] - %tobool = icmp eq i32 %control, 0 - br i1 %tobool, label %if.end, label %if.then - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv - %tmp4 = load i32, ptr %arrayidx, align 4 - br label %if.end - -if.end: ; preds = %for.body, %if.then - %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] - %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv - %tmp9 = load i32, ptr %arrayidx2, align 4 - %mul = mul nsw i32 %tmp9, %x.0 - %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv - store i32 %mul, ptr %arrayidx4, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %for.body, label %for.end - -for.end: ; preds = %if.end - ret void -} diff --git a/polly/test/GPGPU/region-stmt.ll b/polly/test/GPGPU/region-stmt.ll deleted file mode 100644 --- a/polly/test/GPGPU/region-stmt.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * 
sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (128) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(4); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, dev_MemRef_B); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (128) * sizeof(float), cudaMemcpyDeviceToHost)); - -; CODE: # kernel0 -; CODE-NEXT: Stmt_for_body__TO__if_end(32 * b0 + t0); - -; IR: @polly_initContext - -; KERNEL-IR: kernel_0 - -; REQUIRES: pollyacc - -; void foo(float A[], float B[]) { -; for (long i = 0; i < 128; i++) -; if (A[i] == 42) -; B[i] += 2 * i; -; else -; B[i] += 4 * i; -; } -; -source_filename = "/tmp/test.c" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] - %exitcond = icmp ne i64 %i.0, 128 - br i1 %exitcond, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %arrayidx = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp = load float, ptr %arrayidx, align 4 - %cmp1 = fcmp oeq float %tmp, 4.200000e+01 - br i1 %cmp1, label %if.then, label %if.else - -if.then: ; preds = %for.body - %mul = shl nsw i64 %i.0, 1 - %conv = sitofp i64 %mul to float - %arrayidx2 = getelementptr inbounds float, ptr %B, i64 %i.0 - %tmp1 = load float, ptr %arrayidx2, align 4 - %add = fadd float %tmp1, %conv - store float %add, ptr %arrayidx2, align 4 - br label %if.end - -if.else: ; preds = %for.body - %mul3 = shl nsw i64 %i.0, 2 - %conv4 = sitofp i64 %mul3 to float - %arrayidx5 = getelementptr inbounds float, ptr %B, i64 %i.0 - %tmp2 = load float, ptr %arrayidx5, align 4 - %add6 = fadd float %tmp2, %conv4 - store float %add6, ptr %arrayidx5, align 4 - br label %if.end - -if.end: ; preds = %if.else, %if.then - br label %for.inc - -for.inc: 
; preds = %if.end - %inc = add nuw nsw i64 %i.0, 1 - br label %for.cond - -for.end: ; preds = %for.cond - ret void -} diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll b/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll deleted file mode 100644 --- a/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4 -; KERNEL-IR-NEXT: br label %polly.merge - -define void @kernel_dynprog(ptr %sum_c) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry - br label %for.body3 - -for.cond1.loopexit: ; preds = %for.end - %indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1 - %exitcond57 = icmp ne i64 %indvars.iv.next56, 49 - br i1 %exitcond57, label %for.body3, label %for.inc55 - -for.body3: ; preds = %for.cond1.loopexit, %for.cond1.preheader - %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ] - %indvars.iv48 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next49, %for.cond1.loopexit ] - %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1 - %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv48, i64 %indvars.iv55 - store i32 0, ptr %arrayidx10, align 4 - %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv48 - br label %for.end - -for.end: ; preds = %for.body3 - br label %for.cond1.loopexit - -for.inc55: ; preds = %for.cond1.loopexit - ret void -} diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll b/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll deleted file mode 100644 --- 
a/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; REQUIRES: pollyacc - -; Ensure that no dead instructions are emitted between the store and the -; branch instruction of the ScopStmt. At some point, our dead-code-elimination -; did not remove code that was inserted to compute the old (unused) branch -; condition. This code referred to CPU registers and consequently resulted -; in invalid bitcode. - -; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4 -; KERNEL-IR-NEXT: br label %polly.merge - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @kernel_dynprog(ptr %sum_c) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry - br label %for.body3 - -for.cond4.for.cond1.loopexit_crit_edge: ; preds = %for.end - br label %for.cond1.loopexit - -for.cond1.loopexit: ; preds = %for.cond4.for.cond1.loopexit_crit_edge - br i1 undef, label %for.body3, label %for.inc55 - -for.body3: ; preds = %for.cond1.loopexit, %for.cond1.preheader - %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ] - %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1 - br label %for.body6 - -for.body6: ; preds = %for.end, %for.body3 - %indvars.iv50 = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next51, %for.end ] - %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 %indvars.iv55 - store i32 0, ptr %arrayidx10, align 4 - %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv50 - br i1 %cmp1334, label %for.body14.lr.ph, label %for.end - -for.body14.lr.ph: ; preds = %for.body6 - br label %for.body14 - -for.body14: ; preds = %for.body14, %for.body14.lr.ph - %arrayidx32 = getelementptr inbounds [50 x 
[50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 0 - br i1 false, label %for.body14, label %for.cond12.for.end_crit_edge - -for.cond12.for.end_crit_edge: ; preds = %for.body14 - br label %for.end - -for.end: ; preds = %for.cond12.for.end_crit_edge, %for.body6 - %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1 - %lftr.wideiv53 = trunc i64 %indvars.iv.next51 to i32 - %exitcond54 = icmp ne i32 %lftr.wideiv53, 50 - br i1 %exitcond54, label %for.body6, label %for.cond4.for.cond1.loopexit_crit_edge - -for.inc55: ; preds = %for.cond1.loopexit - unreachable -} diff --git a/polly/test/GPGPU/run-time-check.ll b/polly/test/GPGPU/run-time-check.ll deleted file mode 100644 --- a/polly/test/GPGPU/run-time-check.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR -; -; REQUIRES: pollyacc -; -; void foo(long n, float A[][32]) { -; for (long i = 0; i < n; i++) -; for (long j = 0; j < n; j++) -; A[i][j] += A[i + 1][j + 1]; -; } - -; IR: %tmp = icmp slt i64 %i.0, %n -; IR-NEXT: br i1 %tmp, label %bb2, label %polly.merge_new_and_old - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(i64 %n, ptr %A) { -bb: - br label %bb1 - -bb1: ; preds = %bb15, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp16, %bb15 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb17 - -bb2: ; preds = %bb1 - br label %bb3 - -bb3: ; preds = %bb12, %bb2 - %j.0 = phi i64 [ 0, %bb2 ], [ %tmp13, %bb12 ] - %exitcond = icmp ne i64 %j.0, %n - br i1 %exitcond, label %bb4, label %bb14 - -bb4: ; preds = %bb3 - %tmp5 = add nuw nsw i64 %j.0, 1 - %tmp6 = add nuw nsw i64 %i.0, 1 - %tmp7 = getelementptr inbounds [32 x float], ptr %A, i64 %tmp6, i64 %tmp5 - %tmp8 = load float, ptr %tmp7, align 4 - %tmp9 = getelementptr inbounds [32 x float], ptr %A, i64 %i.0, i64 %j.0 - %tmp10 = load float, ptr %tmp9, align 4 - %tmp11 = fadd float %tmp10, %tmp8 - store float %tmp11, ptr %tmp9, align 4 - br 
label %bb12 - -bb12: ; preds = %bb4 - %tmp13 = add nuw nsw i64 %j.0, 1 - br label %bb3 - -bb14: ; preds = %bb3 - br label %bb15 - -bb15: ; preds = %bb14 - %tmp16 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb17: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-param-and-value-32-bit.ll b/polly/test/GPGPU/scalar-param-and-value-32-bit.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-param-and-value-32-bit.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} -; -; void foo(float A[], int n) { -; for (long j = 0; j < n; j++) -; A[j + n] += 42; -; } - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i32 %n) - -define void @foo(ptr %A, i32 %n) { -bb: - br label %bb1 - -bb1: ; preds = %bb9, %bb - %j.0 = phi i64 [ 0, %bb ], [ %tmp10, %bb9 ] - %tmp = sext i32 %n to i64 - %tmp2 = icmp slt i64 %j.0, %tmp - br i1 %tmp2, label %bb3, label %bb11 - -bb3: ; preds = %bb1 - %tmp4 = sext i32 %n to i64 - %tmp5 = add nsw i64 %j.0, %tmp4 - %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5 - %tmp7 = load float, ptr %tmp6, align 4 - %tmp8 = fadd float %tmp7, 4.200000e+01 - store float %tmp8, ptr %tmp6, align 4 - br label %bb9 - -bb9: ; preds = %bb3 - %tmp10 = add nuw nsw i64 %j.0, 1 - br label %bb1 - -bb11: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-param-and-value-use.ll b/polly/test/GPGPU/scalar-param-and-value-use.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-param-and-value-use.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=IR %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; void foo(long n, float A[][n]) { -; for (long i = 0; i < 32; i++) -; for 
(long j = 0; j < 32; j++) -; A[i][j] += A[i + 1][j + 1]; -; } - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; This test case failed at some point as %n was only available in this kernel -; when referenced through an isl_id in an isl ast expression, but not when -; it was referenced from a SCEV or instruction that not part of any loop -; bound. - -; IR: %polly.access.mul.MemRef_A = mul nsw i64 {{.*}}, %n - -define void @foo(i64 %n, ptr %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb19, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp20, %bb19 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb21 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb16, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp17, %bb16 ] - %exitcond = icmp ne i64 %j.0, 32 - br i1 %exitcond, label %bb5, label %bb18 - -bb5: ; preds = %bb4 - %tmp = add nuw nsw i64 %j.0, 1 - %tmp6 = add nuw nsw i64 %i.0, 1 - %tmp7 = mul nsw i64 %tmp6, %n - %tmp8 = getelementptr inbounds float, ptr %A, i64 %tmp7 - %tmp9 = getelementptr inbounds float, ptr %tmp8, i64 %tmp - %tmp10 = load float, ptr %tmp9, align 4 - %tmp11 = mul nsw i64 %i.0, %n - %tmp12 = getelementptr inbounds float, ptr %A, i64 %tmp11 - %tmp13 = getelementptr inbounds float, ptr %tmp12, i64 %j.0 - %tmp14 = load float, ptr %tmp13, align 4 - %tmp15 = fadd float %tmp14, %tmp10 - store float %tmp15, ptr %tmp13, align 4 - br label %bb16 - -bb16: ; preds = %bb5 - %tmp17 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb18: ; preds = %bb4 - br label %bb19 - -bb19: ; preds = %bb18 - %tmp20 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb21: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-fp128.ll b/polly/test/GPGPU/scalar-parameter-fp128.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-fp128.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This 
fails today with "LowerFormalArguments didn't emit the correct number of values!" - -; void foo(fp128 A[], fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @fp128(ptr %A, fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0 - %tmp3 = load fp128, ptr %tmp, align 4 - %tmp4 = fadd fp128 %tmp3, %b - store fp128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-half.ll b/polly/test/GPGPU/scalar-parameter-half.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-half.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; void foo(half A[], half b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @half(ptr %A, half %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds half, ptr %A, i64 %i.0 - %tmp3 = load half, ptr %tmp, align 4 - %tmp4 = fadd half %tmp3, %b - store half %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-i120.ll b/polly/test/GPGPU/scalar-parameter-i120.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-i120.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt 
%loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" - -; void foo(i120 A[], i120 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i120(ptr %A, i120 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i120 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i120 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i120, ptr %A, i120 %i.0 - %tmp3 = load i120, ptr %tmp, align 4 - %tmp4 = add i120 %tmp3, %b - store i120 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i120 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-i128.ll b/polly/test/GPGPU/scalar-parameter-i128.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-i128.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; void foo(i128 A[], i128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i128(ptr %A, i128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i128 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i128 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i128, ptr %A, i128 %i.0 - %tmp3 = load i128, ptr %tmp, align 4 - %tmp4 = add i128 %tmp3, %b - store i128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i128 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-i3000.ll 
b/polly/test/GPGPU/scalar-parameter-i3000.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-i3000.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" - -; void foo(i3000 A[], i3000 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i3000(ptr %A, i3000 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i3000 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i3000 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i3000, ptr %A, i3000 %i.0 - %tmp3 = load i3000, ptr %tmp, align 4 - %tmp4 = add i3000 %tmp3, %b - store i3000 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i3000 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-i80.ll b/polly/test/GPGPU/scalar-parameter-i80.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-i80.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" - -; void foo(i80 A[], i80 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i80(ptr %A, i80 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i80 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i80 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i80, ptr %A, i80 %i.0 - %tmp3 = load i80, ptr %tmp, align 4 - %tmp4 = 
add i80 %tmp3, %b - store i80 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i80 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "LowerFormalArguments didn't emit the correct number of values!" - -; void foo(fp128 A[], fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @ppc_fp128(ptr %A, ppc_fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds ppc_fp128, ptr %A, i64 %i.0 - %tmp3 = load ppc_fp128, ptr %tmp, align 4 - %tmp4 = fadd ppc_fp128 %tmp3, %b - store ppc_fp128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "LowerFormalArguments didn't emit the correct number of values!" 
- -; void foo(fp128 A[], fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @fp128(ptr %A, fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0 - %tmp3 = load fp128, ptr %tmp, align 4 - %tmp4 = fadd fp128 %tmp3, %b - store fp128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter.ll b/polly/test/GPGPU/scalar-parameter.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter.ll +++ /dev/null @@ -1,411 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=IR %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today due to extensive output differences from when the test was written. 
- -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, float %MemRef_b) - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, MemRef_b); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(float A[], float b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @float(ptr %A, float %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp3 = load float, ptr %tmp, align 4 - %tmp4 = fadd float %tmp3, %b - store float %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, double %MemRef_b) -; KERNEL-NEXT: entry: -; KERNEL-NEXT: %b.s2a = alloca double -; KERNEL-NEXT: store double %MemRef_b, ptr %b.s2a - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A, MemRef_b); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(double), 
cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(double A[], double b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @double(ptr %A, double %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds double, ptr %A, i64 %i.0 - %tmp3 = load double, ptr %tmp, align 4 - %tmp4 = fadd double %tmp3, %b - store double %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i1), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i1), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i1 A[], i1 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i1(ptr %A, i1 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i1, ptr %A, i64 %i.0 - %tmp3 = load i1, ptr %tmp, align 4 - %tmp4 = add i1 %tmp3, %b - store i1 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, 
MemRef_A, (1024) * sizeof(i3), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i3), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i3 A[], i3 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i3(ptr %A, i3 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i3, ptr %A, i64 %i.0 - %tmp3 = load i3, ptr %tmp, align 4 - %tmp4 = add i3 %tmp3, %b - store i3 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i8), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i8), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i8 A[], i32 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i8(ptr %A, i8 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i8, ptr %A, i64 %i.0 - %tmp3 = load i8, ptr %tmp, align 4 - %tmp4 = add i8 %tmp3, %b - 
store i8 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; IR-LABEL: @i8 - -; IR: [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A) -; IR-NEXT: store ptr [[REGA:%.+]], ptr %polly_launch_0_param_0 -; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params -; IR-NEXT: store i8 %b, ptr %polly_launch_0_param_1 -; IR-NEXT: [[REGD:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1 -; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGD]] - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i32), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i32 A[], i32 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i32(ptr %A, i32 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i32, ptr %A, i64 %i.0 - %tmp3 = load i32, ptr %tmp, align 4 - %tmp4 = add i32 %tmp3, %b - store i32 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i60), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; 
CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i60), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i60 A[], i60 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i60(ptr %A, i60 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i60, ptr %A, i64 %i.0 - %tmp3 = load i60, ptr %tmp, align 4 - %tmp4 = add i60 %tmp3, %b - store i60 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i64 A[], i64 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i64(ptr %A, i64 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp3 = load i64, ptr %tmp, align 4 - %tmp4 = add i64 %tmp3, %b - store i64 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw 
nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll b/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: opt %loadPolly -polly-acc-dump-code -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-code -polly-stmt-granularity=bb \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODE - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting -polly-stmt-granularity=bb < %s \ -; RUN: | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: { Stmt_loop_a[i0] -> MemRef_p[0] }; -; SCOP-NEXT: Execution Context: { : } -; SCOP-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: { -; CODE-NEXT: if (32 * b0 + t0 <= 1025) { -; CODE-NEXT: Stmt_loop(32 * b0 + t0); -; CODE-NEXT: write(0); -; CODE-NEXT: } -; CODE-NEXT: sync0(); -; CODE-NEXT: } - -; Check that we generate a correct "always false" branch. -; HOST-IR: br i1 false, label %polly.start, label %loop.pre_entry_bb - -; This test case checks that we generate correct code if PPCGCodeGeneration -; decides a build is unsuccessful with invariant load hoisting enabled. -; -; There is a conditional branch which switches between the original code and -; the new code. We try to set this conditional branch to branch on false. -; However, invariant load hoisting changes the structure of the scop, so we -; need to change the way we *locate* this instruction. 
- -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" -target triple = "i386-apple-macosx10.12.0" - -define void @foo(ptr %A, ptr %p) { -entry: - br label %loop - -loop: - %indvar = phi i64 [0, %entry], [%indvar.next, %loop] - %indvar.next = add i64 %indvar, 1 - %invariant = load float, ptr %p - %ptr = getelementptr float, ptr %A, i64 %indvar - store float 42.0, ptr %ptr - %cmp = icmp sle i64 %indvar, 1024 - br i1 %cmp, label %loop, label %loop2 - -loop2: - %indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2] - %indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2] - %indvar2.next = add i64 %indvar2, 1 - store float %indvar2f, ptr %A - %cmp2 = icmp sle i64 %indvar2, 1024 - br i1 %cmp2, label %loop2, label %end - -end: - ret void -} diff --git a/polly/test/GPGPU/scheduler-timeout.ll b/polly/test/GPGPU/scheduler-timeout.ll deleted file mode 100644 --- a/polly/test/GPGPU/scheduler-timeout.ll +++ /dev/null @@ -1,174 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; This test case took at some point forever to schedule, as the isl scheduler -; seems to have problems if domain constraints appear in the dependences -; provided to the scheduler. 
- -; /* D := alpha*A*B*C + beta*D */ -; for (i = 0; i < _PB_NI; i++) -; for (j = 0; j < _PB_NJ; j++) -; { -; tmp[i][j] = 0; -; for (k = 0; k < _PB_NK; ++k) -; tmp[i][j] += alpha * A[i][k] * B[k][j]; -; } -; for (i = 0; i < _PB_NI; i++) -; for (j = 0; j < _PB_NL; j++) -; { -; D[i][j] *= beta; -; for (k = 0; k < _PB_NJ; ++k) -; D[i][j] += tmp[i][k] * C[k][j]; -; } - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(16, 32); -; CODE-NEXT: dim3 k0_dimGrid(128, 128); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_tmp, dev_MemRef_A, MemRef_alpha, dev_MemRef_B); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock(16, 32); -; CODE-NEXT: dim3 k1_dimGrid(128, 128); -; CODE-NEXT: kernel1 <<>> (dev_MemRef_tmp, dev_MemRef_D, MemRef_beta, dev_MemRef_C); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_tmp, dev_MemRef_tmp, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_D, dev_MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost)); - -; CODE: # kernel0 -; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1) -; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) { -; CODE-NEXT: if (c2 == 0) -; CODE-NEXT: Stmt_for_body6(32 * b0 + t0, 32 * b1 + t1 + 16 * c4); -; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1) -; CODE-NEXT: Stmt_for_body11(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5); -; CODE-NEXT: } - -; CODE: # kernel1 -; CODE-NEXT: for (int c2 = 0; c2 <= 127; 
c2 += 1) -; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) { -; CODE-NEXT: if (c2 == 0) -; CODE-NEXT: Stmt_for_body36(32 * b0 + t0, 32 * b1 + t1 + 16 * c4); -; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1) -; CODE-NEXT: Stmt_for_body44(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5); -; CODE-NEXT: } - - - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start(i64, ptr nocapture) #0 - -; Function Attrs: nounwind uwtable -define internal void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, float %alpha, float %beta, ptr %tmp, ptr %A, ptr %B, ptr %C, ptr %D) #1 { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry.split, %for.inc28 - %indvars.iv19 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next20, %for.inc28 ] - br label %for.body6 - -for.cond31.preheader: ; preds = %for.inc28 - br label %for.cond34.preheader - -for.body6: ; preds = %for.cond4.preheader, %for.inc25 - %indvars.iv16 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next17, %for.inc25 ] - %arrayidx8 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16 - store float 0.000000e+00, ptr %arrayidx8, align 4, !tbaa !1 - br label %for.body11 - -for.body11: ; preds = %for.body6, %for.body11 - %indvars.iv13 = phi i64 [ 0, %for.body6 ], [ %indvars.iv.next14, %for.body11 ] - %arrayidx15 = getelementptr inbounds [4096 x float], ptr %A, i64 %indvars.iv19, i64 %indvars.iv13 - %tmp22 = load float, ptr %arrayidx15, align 4, !tbaa !1 - %mul = fmul float %tmp22, %alpha - %arrayidx19 = getelementptr inbounds [4096 x float], ptr %B, i64 %indvars.iv13, i64 %indvars.iv16 - %tmp23 = load float, ptr %arrayidx19, align 4, !tbaa !1 - %mul20 = fmul float %mul, %tmp23 - %arrayidx24 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16 - %tmp24 = load float, ptr %arrayidx24, align 4, !tbaa !1 - %add = fadd float %tmp24, %mul20 - store float %add, ptr 
%arrayidx24, align 4, !tbaa !1 - %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 - %exitcond15 = icmp ne i64 %indvars.iv.next14, 4096 - br i1 %exitcond15, label %for.body11, label %for.inc25 - -for.inc25: ; preds = %for.body11 - %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 - %exitcond18 = icmp ne i64 %indvars.iv.next17, 4096 - br i1 %exitcond18, label %for.body6, label %for.inc28 - -for.inc28: ; preds = %for.inc25 - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond21 = icmp ne i64 %indvars.iv.next20, 4096 - br i1 %exitcond21, label %for.cond4.preheader, label %for.cond31.preheader - -for.cond34.preheader: ; preds = %for.cond31.preheader, %for.inc65 - %indvars.iv10 = phi i64 [ 0, %for.cond31.preheader ], [ %indvars.iv.next11, %for.inc65 ] - br label %for.body36 - -for.body36: ; preds = %for.cond34.preheader, %for.inc62 - %indvars.iv7 = phi i64 [ 0, %for.cond34.preheader ], [ %indvars.iv.next8, %for.inc62 ] - %arrayidx40 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7 - %tmp25 = load float, ptr %arrayidx40, align 4, !tbaa !1 - %mul41 = fmul float %tmp25, %beta - store float %mul41, ptr %arrayidx40, align 4, !tbaa !1 - br label %for.body44 - -for.body44: ; preds = %for.body36, %for.body44 - %indvars.iv = phi i64 [ 0, %for.body36 ], [ %indvars.iv.next, %for.body44 ] - %arrayidx48 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv10, i64 %indvars.iv - %tmp26 = load float, ptr %arrayidx48, align 4, !tbaa !1 - %arrayidx52 = getelementptr inbounds [4096 x float], ptr %C, i64 %indvars.iv, i64 %indvars.iv7 - %tmp27 = load float, ptr %arrayidx52, align 4, !tbaa !1 - %mul53 = fmul float %tmp26, %tmp27 - %arrayidx57 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7 - %tmp28 = load float, ptr %arrayidx57, align 4, !tbaa !1 - %add58 = fadd float %tmp28, %mul53 - store float %add58, ptr %arrayidx57, align 4, !tbaa !1 - %indvars.iv.next = add nuw nsw i64 
%indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 4096 - br i1 %exitcond, label %for.body44, label %for.inc62 - -for.inc62: ; preds = %for.body44 - %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1 - %exitcond9 = icmp ne i64 %indvars.iv.next8, 4096 - br i1 %exitcond9, label %for.body36, label %for.inc65 - -for.inc65: ; preds = %for.inc62 - %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1 - %exitcond12 = icmp ne i64 %indvars.iv.next11, 4096 - br i1 %exitcond12, label %for.cond34.preheader, label %for.end67 - -for.end67: ; preds = %for.inc65 - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} - -!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"} -!1 = !{!2, !2, i64 0} -!2 = !{!"float", !3, i64 0} -!3 = !{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/shared-memory-scalar.ll b/polly/test/GPGPU/shared-memory-scalar.ll deleted file mode 100644 --- a/polly/test/GPGPU/shared-memory-scalar.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; void add(float *A, float alpha) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 10; j++) -; A[i] += alpha; -; } - -; CODE: read(t0); -; CODE-NEXT: sync0(); -; CODE-NEXT: for (int c3 = 0; c3 <= 9; c3 += 1) -; CODE-NEXT: Stmt_bb5(t0, c3); -; CODE-NEXT: sync1(); -; CODE-NEXT: 
write(t0); - -; This test case was intended to test code generation for scalars stored -; in shared memory. However, after properly marking the scalar as read-only -; the scalar is not stored any more in shared memory. We still leave this -; test case as documentation if we every forget to mark scalars as read-only. - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @add(ptr %A, float %alpha) { -bb: - br label %bb2 - -bb2: ; preds = %bb11, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb8, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %j.0, 10 - br i1 %exitcond, label %bb5, label %bb10 - -bb5: ; preds = %bb4 - %tmp = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp6 = load float, ptr %tmp, align 4 - %tmp7 = fadd float %tmp6, %alpha - store float %tmp7, ptr %tmp, align 4 - br label %bb8 - -bb8: ; preds = %bb5 - %tmp9 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb10: ; preds = %bb4 - br label %bb11 - -bb11: ; preds = %bb10 - %tmp12 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb13: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/shared-memory-two-dimensional.ll b/polly/test/GPGPU/shared-memory-two-dimensional.ll deleted file mode 100644 --- a/polly/test/GPGPU/shared-memory-two-dimensional.ll +++ /dev/null @@ -1,103 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; void foo(float A[], float b[][8]) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 16; j++) -; for (long k = 0; 
k < 8; k++) -; A[i] += j * k * b[j][k]; -; } - - -; CODE: # kernel0 -; CODE-NEXT: { -; CODE-NEXT: if (t0 <= 7) -; CODE-NEXT: for (int c0 = 0; c0 <= 15; c0 += 1) -; CODE-NEXT: read(c0, t0); -; CODE-NEXT: read(t0); -; CODE-NEXT: sync0(); -; CODE-NEXT: for (int c3 = 0; c3 <= 15; c3 += 1) -; CODE-NEXT: for (int c4 = 0; c4 <= 7; c4 += 1) -; CODE-NEXT: Stmt_bb8(t0, c3, c4); -; CODE-NEXT: sync1(); -; CODE-NEXT: write(t0); -; CODE-NEXT: } - -; KERNEL: @shared_MemRef_b = internal addrspace(3) global [16 x [8 x float]] zeroinitializer, align 4 - -; KERNEL: %polly.access.mul.MemRef_b = mul nsw i64 %polly.indvar, 8 -; KERNEL-NEXT: %polly.access.add.MemRef_b = add nsw i64 %polly.access.mul.MemRef_b, %t0 -; KERNEL-NEXT: %polly.access.MemRef_b = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_b, i64 %polly.access.add.MemRef_b -; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_b -; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_b - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(float* %A, [8 x float]* %b) { -bb: - br label %bb3 - -bb3: ; preds = %bb22, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp23, %bb22 ] - %exitcond2 = icmp ne i64 %i.0, 32 - br i1 %exitcond2, label %bb4, label %bb24 - -bb4: ; preds = %bb3 - br label %bb5 - -bb5: ; preds = %bb19, %bb4 - %j.0 = phi i64 [ 0, %bb4 ], [ %tmp20, %bb19 ] - %exitcond1 = icmp ne i64 %j.0, 16 - br i1 %exitcond1, label %bb6, label %bb21 - -bb6: ; preds = %bb5 - br label %bb7 - -bb7: ; preds = %bb16, %bb6 - %k.0 = phi i64 [ 0, %bb6 ], [ %tmp17, %bb16 ] - %exitcond = icmp ne i64 %k.0, 8 - br i1 %exitcond, label %bb8, label %bb18 - -bb8: ; preds = %bb7 - %tmp = mul nuw nsw i64 %j.0, %k.0 - %tmp9 = sitofp i64 %tmp to float - %tmp10 = getelementptr inbounds [8 x float], [8 x float]* %b, i64 %j.0, i64 %k.0 - %tmp11 = load float, float* %tmp10, align 4 - %tmp12 = fmul float %tmp9, %tmp11 - %tmp13 = getelementptr inbounds 
float, float* %A, i64 %i.0 - %tmp14 = load float, float* %tmp13, align 4 - %tmp15 = fadd float %tmp14, %tmp12 - store float %tmp15, float* %tmp13, align 4 - br label %bb16 - -bb16: ; preds = %bb8 - %tmp17 = add nuw nsw i64 %k.0, 1 - br label %bb7 - -bb18: ; preds = %bb7 - br label %bb19 - -bb19: ; preds = %bb18 - %tmp20 = add nuw nsw i64 %j.0, 1 - br label %bb5 - -bb21: ; preds = %bb5 - br label %bb22 - -bb22: ; preds = %bb21 - %tmp23 = add nuw nsw i64 %i.0, 1 - br label %bb3 - -bb24: ; preds = %bb3 - ret void -} diff --git a/polly/test/GPGPU/shared-memory.ll b/polly/test/GPGPU/shared-memory.ll deleted file mode 100644 --- a/polly/test/GPGPU/shared-memory.ll +++ /dev/null @@ -1,83 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; void add(float *A) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 10; j++) -; A[i] += 1; -; } - -; CODE: # kernel0 -; CODE: { -; CODE: read(t0); -; CODE: sync0(); -; CODE: for (int c3 = 0; c3 <= 9; c3 += 1) -; CODE: Stmt_bb5(t0, c3); -; CODE: sync1(); -; CODE: write(t0); -; CODE: } - -; KERNEL: @shared_MemRef_A = internal addrspace(3) global [32 x float] zeroinitializer, align 4 - -; KERNEL: %polly.access.shared_MemRef_A = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0 -; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* 
%polly.access.MemRef_A -; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_A - -; KERNEL: %polly.access.shared_MemRef_A3 = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A4 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A4, i64 %t0 -; KERNEL-NEXT: %shared.write = load float, float addrspace(3)* %polly.access.shared_MemRef_A3 -; KERNEL-NEXT: store float %shared.write, float addrspace(1)* %polly.access.MemRef_A5 - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @add(float* %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb11, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb8, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %j.0, 10 - br i1 %exitcond, label %bb5, label %bb10 - -bb5: ; preds = %bb4 - %tmp = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp6 = load float, float* %tmp, align 4 - %tmp7 = fadd float %tmp6, 1.000000e+00 - store float %tmp7, float* %tmp, align 4 - br label %bb8 - -bb8: ; preds = %bb5 - %tmp9 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb10: ; preds = %bb4 - br label %bb11 - -bb11: ; preds = %bb10 - %tmp12 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb13: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/simple-managed-memory-rewrite.ll b/polly/test/GPGPU/simple-managed-memory-rewrite.ll deleted file mode 100644 --- a/polly/test/GPGPU/simple-managed-memory-rewrite.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -S 
-polly-process-unprofitable -polly-acc-mincompute=0 \ -; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP: i32 MemRef_A[*]; - -; Check that we generate a constructor call for @A.toptr -; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr {{.*}}, ptr @A.toptr }] - -; Check that we generate a constructor -; 4 bytes * 100 = 400 -; HOST-IR: define void {{.*}}constructor() { -; HOST-IR-NEXT: entry: -; HOST-IR-NEXT: %mem.raw = call ptr @polly_mallocManaged(i64 400) -; HOST-IR-NEXT: store ptr %mem.raw, ptr @A.toptr -; HOST-IR-NEXT: ret void -; HOST-IR-NEXT: } - -; HOST-IR-NOT: @A - -source_filename = "test.c" -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -@A = internal global [100 x i32] zeroinitializer, align 16 - -define void @f() { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds [100 x i32], ptr @A, i64 0, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} 
-!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/size-cast.ll b/polly/test/GPGPU/size-cast.ll deleted file mode 100644 --- a/polly/test/GPGPU/size-cast.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; REQUIRES: pollyacc - -; This test case ensures that we properly sign-extend the types we are using. - -; CODE: if (arg >= 1 && arg1 == 0) { -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_arg3, MemRef_arg3, (arg) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(arg >= 1048545 ? 32768 : (arg + 31) / 32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_arg3, dev_MemRef_arg2, arg, arg1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_arg2, dev_MemRef_arg2, (arg) * sizeof(double), cudaMemcpyDeviceToHost)); -; CODE-NEXT cudaCheckReturn(cudaFree(dev_MemRef_arg3)); -; CODE-NEXT cudaCheckReturn(cudaFree(dev_MemRef_arg2)); - -; CODE: # kernel0 -; CODE-NEXT: for (int c0 = 0; c0 <= (arg - 32 * b0 - 1) / 1048576; c0 += 1) -; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1) -; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0); - -; IR-LABEL: call ptr @polly_initContextCUDA() -; IR: sext i32 %arg to i64 -; IR-NEXT: mul i64 -; IR-NEXT: @polly_allocateMemoryForDevice - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @hoge(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3) { -bb: - br label %bb4 - -bb4: ; preds = %bb13, %bb - br label %bb6 - -bb5: ; preds = %bb13 - ret void - -bb6: ; preds = %bb6, %bb4 - %tmp = phi i64 [ 0, %bb4 ], [ %tmp10, %bb6 ] - 
%tmp7 = getelementptr inbounds double, ptr %arg3, i64 %tmp - %tmp8 = load double, ptr %tmp7, align 8 - %tmp9 = getelementptr inbounds [1000 x double], ptr %arg2, i64 0, i64 %tmp - store double %tmp8, ptr %tmp9, align 8 - %tmp10 = add nuw nsw i64 %tmp, 1 - %tmp11 = zext i32 %arg to i64 - %tmp12 = icmp ne i64 %tmp10, %tmp11 - br i1 %tmp12, label %bb6, label %bb13 - -bb13: ; preds = %bb6 - %tmp14 = zext i32 %arg1 to i64 - %tmp15 = icmp ne i64 0, %tmp14 - br i1 %tmp15, label %bb4, label %bb5 -} diff --git a/polly/test/GPGPU/spir-codegen.ll b/polly/test/GPGPU/spir-codegen.ll deleted file mode 100644 --- a/polly/test/GPGPU/spir-codegen.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-gpu-arch=spir32 \ -; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc - -; CHECK: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -; CHECK-NEXT: target triple = "spir-unknown-unknown" - -; CHECK-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { -; CHECK-NEXT: entry: -; CHECK-NEXT: %0 = call i32 @__gen_ocl_get_group_id0() -; CHECK-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64 -; CHECK-NEXT: %1 = call i32 @__gen_ocl_get_group_id1() -; CHECK-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64 -; CHECK-NEXT: %2 = call i32 @__gen_ocl_get_local_id0() -; CHECK-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64 -; CHECK-NEXT: %3 = call i32 @__gen_ocl_get_local_id1() -; CHECK-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64 -; CHECK-NEXT: br label 
%polly.loop_preheader - -; CHECK-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5 -; CHECK-NEXT: ret void - -; CHECK-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader -; CHECK-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ] -; CHECK-NEXT: %4 = mul nsw i64 32, %__gen_ocl_get_group_id0 -; CHECK-NEXT: %5 = add nsw i64 %4, %__gen_ocl_get_local_id0 -; CHECK-NEXT: %6 = mul nsw i64 32, %__gen_ocl_get_group_id1 -; CHECK-NEXT: %7 = add nsw i64 %6, %__gen_ocl_get_local_id1 -; CHECK-NEXT: %8 = mul nsw i64 16, %polly.indvar -; CHECK-NEXT: %9 = add nsw i64 %7, %8 -; CHECK-NEXT: br label %polly.stmt.bb5 - -; CHECK-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header -; CHECK-NEXT: %10 = mul i64 %5, %9 -; CHECK-NEXT: %p_tmp6 = sitofp i64 %10 to float -; CHECK-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; CHECK-NEXT: %11 = mul nsw i64 32, %__gen_ocl_get_group_id0 -; CHECK-NEXT: %12 = add nsw i64 %11, %__gen_ocl_get_local_id0 -; CHECK-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024 -; CHECK-NEXT: %13 = mul nsw i64 32, %__gen_ocl_get_group_id1 -; CHECK-NEXT: %14 = add nsw i64 %13, %__gen_ocl_get_local_id1 -; CHECK-NEXT: %15 = mul nsw i64 16, %polly.indvar -; CHECK-NEXT: %16 = add nsw i64 %14, %15 -; CHECK-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16 -; CHECK-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A -; CHECK-NEXT: %tmp8_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4 -; CHECK-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6 -; CHECK-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; CHECK-NEXT: %17 = mul nsw i64 32, %__gen_ocl_get_group_id0 -; CHECK-NEXT: %18 = add nsw i64 %17, %__gen_ocl_get_local_id0 -; CHECK-NEXT: %polly.access.mul.MemRef_A2 = mul 
nsw i64 %18, 1024 -; CHECK-NEXT: %19 = mul nsw i64 32, %__gen_ocl_get_group_id1 -; CHECK-NEXT: %20 = add nsw i64 %19, %__gen_ocl_get_local_id1 -; CHECK-NEXT: %21 = mul nsw i64 16, %polly.indvar -; CHECK-NEXT: %22 = add nsw i64 %20, %21 -; CHECK-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22 -; CHECK-NEXT: %polly.access.MemRef_A4 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3 -; CHECK-NEXT: store float %p_tmp9, float addrspace(1)* %polly.access.MemRef_A4, align 4 -; CHECK-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 -; CHECK-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 1 -; CHECK-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit - -; CHECK-LABEL: polly.loop_preheader: ; preds = %entry -; CHECK-NEXT: br label %polly.loop_header - -; CHECK: attributes #0 = { "polly.skip.fn" } - -; void double_parallel_loop(float A[][1024]) { -; for (long i = 0; i < 1024; i++) -; for (long j = 0; j < 1024; j++) -; A[i][j] += i * j; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @double_parallel_loop([1024 x float]* %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb13, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] - %exitcond1 = icmp ne i64 %i.0, 1024 - br i1 %exitcond1, label %bb3, label %bb15 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb10, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %j.0, 1024 - br i1 %exitcond, label %bb5, label %bb12 - -bb5: ; preds = %bb4 - %tmp = mul nuw nsw i64 %i.0, %j.0 - %tmp6 = sitofp i64 %tmp to float - %tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0 - %tmp8 = load float, float* %tmp7, align 4 - %tmp9 = fadd float %tmp8, %tmp6 - store float %tmp9, float* %tmp7, align 4 - br label %bb10 - -bb10: ; preds = %bb5 - %tmp11 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb12: ; preds = %bb4 - br 
label %bb13 - -bb13: ; preds = %bb12 - %tmp14 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb15: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/spir-typesize.ll b/polly/test/GPGPU/spir-typesize.ll deleted file mode 100644 --- a/polly/test/GPGPU/spir-typesize.ll +++ /dev/null @@ -1,90 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-gpu-arch=spir64 \ -; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ -; RUN: FileCheck -check-prefix=I64 %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-gpu-arch=spir32 \ -; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ -; RUN: FileCheck -check-prefix=I32 %s - -; REQUIRES: pollyacc - -; This test case checks whether the openCl runtime functions (get_local_id/get_group_id) return the right types for 32 and 64bit devices. - -; I32: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -; I32-NEXT: target triple = "spir-unknown-unknown" - -; I32-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { -; I32-NEXT: entry: -; I32-NEXT: %0 = call i32 @__gen_ocl_get_group_id0() -; I32-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64 -; I32-NEXT: %1 = call i32 @__gen_ocl_get_group_id1() -; I32-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64 -; I32-NEXT: %2 = call i32 @__gen_ocl_get_local_id0() -; I32-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64 -; I32-NEXT: %3 = call i32 @__gen_ocl_get_local_id1() -; I32-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64 -; I32-NEXT: br label %polly.loop_preheader - -; I64: target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -; I64-next: target triple = "spir64-unknown-unknown" - -; I64-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { -; I64-NEXT: entry: -; I64-NEXT: %0 = call i64 @__gen_ocl_get_group_id0() -; I64-NEXT: %1 = call i64 @__gen_ocl_get_group_id1() -; I64-NEXT: %2 = call i64 @__gen_ocl_get_local_id0() -; I64-NEXT: %3 = call i64 @__gen_ocl_get_local_id1() -; I64-NEXT: br label %polly.loop_preheader - - -; void double_parallel_loop(float A[][1024]) { -; for (long i = 0; i < 1024; i++) -; for (long j = 0; j < 1024; j++) -; A[i][j] += i * j; -; } -; - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @double_parallel_loop(ptr %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb13, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] - %exitcond1 = icmp ne i64 %i.0, 1024 - br i1 %exitcond1, label %bb3, label %bb15 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb10, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %j.0, 1024 - br i1 %exitcond, label %bb5, label %bb12 - -bb5: ; preds = %bb4 - %tmp = mul nuw nsw i64 %i.0, %j.0 - %tmp6 = sitofp i64 %tmp to float - %tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0 - %tmp8 = load float, ptr %tmp7, align 4 - %tmp9 = fadd float %tmp8, %tmp6 - store float %tmp9, ptr %tmp7, align 4 - br label %bb10 - -bb10: ; preds = %bb5 - %tmp11 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12 - %tmp14 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb15: ; preds = %bb2 - ret void -} diff --git 
a/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll b/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s - -; Check that we do not create a kernel if there is an -; unknown function call in a candidate kernel. - -; Check that we model the kernel as a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end13 - -; If a kernel were generated, then this code would have been part of the kernel -; and not the `.ll` file that is generated. -; CHECK: %conv = fpext float %0 to double -; CHECK-NEXT: %1 = tail call double @extern.fn(double %conv) -; CHECK-NEXT: %conv6 = fptrunc double %1 to float - -; REQUIRES: pollyacc - -; static const int N = 1000; -; void f(float A[N][N], int n, float B[N][N]) { -; for(int i = 0; i < n; i++) { -; for(int j = 0; j < n; j++) { -; B[i][j] = extern_fn(A[i][j], 3); -; } -; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.11.0" - -define void @f(ptr %A, i32 %n, ptr %B) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp3 = icmp sgt i32 %n, 0 - br i1 %cmp3, label %for.cond1.preheader.lr.ph, label %for.end13 - -for.cond1.preheader.lr.ph: ; preds = %entry.split - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc11 - %indvars.iv5 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next6, %for.inc11 ] - %cmp21 = icmp sgt i32 %n, 0 - br i1 %cmp21, label %for.body3.lr.ph, label %for.inc11 - -for.body3.lr.ph: ; preds = %for.cond1.preheader - br label %for.body3 - -for.body3: ; preds = %for.body3.lr.ph, %for.body3 - %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ] - 
%arrayidx5 = getelementptr inbounds [1000 x float], ptr %A, i64 %indvars.iv5, i64 %indvars.iv - %0 = load float, ptr %arrayidx5, align 4 - %conv = fpext float %0 to double - %1 = tail call double @extern.fn(double %conv) - %conv6 = fptrunc double %1 to float - %arrayidx10 = getelementptr inbounds [1000 x float], ptr %B, i64 %indvars.iv5, i64 %indvars.iv - store float %conv6, ptr %arrayidx10, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %wide.trip.count = zext i32 %n to i64 - %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond, label %for.body3, label %for.cond1.for.inc11_crit_edge - -for.cond1.for.inc11_crit_edge: ; preds = %for.body3 - br label %for.inc11 - -for.inc11: ; preds = %for.cond1.for.inc11_crit_edge, %for.cond1.preheader - %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1 - %wide.trip.count7 = zext i32 %n to i64 - %exitcond8 = icmp ne i64 %indvars.iv.next6, %wide.trip.count7 - br i1 %exitcond8, label %for.cond1.preheader, label %for.cond.for.end13_crit_edge - -for.cond.for.end13_crit_edge: ; preds = %for.inc11 - br label %for.end13 - -for.end13: ; preds = %for.cond.for.end13_crit_edge, %entry.split - ret void -} - -declare double @extern.fn(double) #0 -attributes #0 = { readnone } diff --git a/polly/test/GPGPU/untouched-arrays.ll b/polly/test/GPGPU/untouched-arrays.ll deleted file mode 100644 --- a/polly/test/GPGPU/untouched-arrays.ll +++ /dev/null @@ -1,270 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_global_1, MemRef_global_1, (142) * sizeof(i32), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(10); -; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_global_1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_global_1, dev_MemRef_global_1, (142) * 
sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE: cudaCheckReturn(cudaFree(dev_MemRef_global_1)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb33(t0, 0); - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -%struct.hoge = type { [23 x i16], [22 x i16], [14 x i16], [13 x i16] } - -@global = external global [9 x %struct.hoge], align 16 -@global.1 = external global [9 x [152 x i32]], align 16 - -; Function Attrs: nounwind uwtable -define void @widget() #0 { -bb: - br label %bb1 - -bb1: ; preds = %bb1, %bb - br i1 undef, label %bb1, label %bb2 - -bb2: ; preds = %bb2, %bb1 - br i1 undef, label %bb2, label %bb3 - -bb3: ; preds = %bb3, %bb2 - br i1 undef, label %bb3, label %bb4 - -bb4: ; preds = %bb4, %bb3 - br i1 undef, label %bb4, label %bb5 - -bb5: ; preds = %bb5, %bb4 - br i1 undef, label %bb5, label %bb6 - -bb6: ; preds = %bb6, %bb5 - br i1 undef, label %bb6, label %bb7 - -bb7: ; preds = %bb7, %bb6 - br i1 undef, label %bb7, label %bb8 - -bb8: ; preds = %bb8, %bb7 - br i1 undef, label %bb8, label %bb9 - -bb9: ; preds = %bb8 - br label %bb10 - -bb10: ; preds = %bb12, %bb9 - br label %bb11 - -bb11: ; preds = %bb11, %bb10 - br i1 undef, label %bb11, label %bb12 - -bb12: ; preds = %bb11 - br i1 undef, label %bb10, label %bb13 - -bb13: ; preds = %bb18, %bb12 - br i1 undef, label %bb16, label %bb14 - -bb14: ; preds = %bb16, %bb13 - br i1 undef, label %bb15, label %bb18 - -bb15: ; preds = %bb14 - br label %bb17 - -bb16: ; preds = %bb16, %bb13 - br i1 undef, label %bb16, label %bb14 - -bb17: ; preds = %bb17, %bb15 - br i1 undef, label %bb17, label %bb18 - -bb18: ; preds = %bb17, %bb14 - br i1 undef, label %bb13, label %bb19 - -bb19: ; preds = %bb25, %bb18 - br label %bb20 - -bb20: ; preds = %bb24, %bb19 - br i1 undef, label %bb21, label %bb24 - -bb21: ; preds = %bb20 - br i1 undef, label %bb23, label %bb22 - -bb22: ; preds = %bb21 - br label %bb24 - -bb23: ; preds = %bb21 - br label %bb24 - -bb24: ; preds 
= %bb23, %bb22, %bb20 - br i1 undef, label %bb20, label %bb25 - -bb25: ; preds = %bb24 - br i1 undef, label %bb19, label %bb26 - -bb26: ; preds = %bb56, %bb25 - %tmp = phi ptr [ undef, %bb56 ], [ getelementptr inbounds ([9 x [152 x i32]], ptr @global.1, i64 0, i64 0, i64 32), %bb25 ] - br label %bb27 - -bb27: ; preds = %bb27, %bb26 - br i1 undef, label %bb27, label %bb28 - -bb28: ; preds = %bb27 - br label %bb30 - -bb30: ; preds = %bb38, %bb28 - %tmp31 = phi i32 [ 3, %bb28 ], [ %tmp40, %bb38 ] - %tmp32 = phi ptr [ %tmp, %bb28 ], [ %tmp39, %bb38 ] - br label %bb33 - -bb33: ; preds = %bb33, %bb30 - %tmp34 = phi i32 [ 0, %bb30 ], [ %tmp37, %bb33 ] - %tmp35 = phi ptr [ %tmp32, %bb30 ], [ undef, %bb33 ] - %tmp36 = getelementptr inbounds i32, ptr %tmp35, i64 1 - store i32 undef, ptr %tmp36, align 4, !tbaa !1 - %tmp37 = add nuw nsw i32 %tmp34, 1 - br i1 false, label %bb33, label %bb38 - -bb38: ; preds = %bb33 - %tmp39 = getelementptr i32, ptr %tmp32, i64 12 - %tmp40 = add nuw nsw i32 %tmp31, 1 - %tmp41 = icmp ne i32 %tmp40, 13 - br i1 %tmp41, label %bb30, label %bb42 - -bb42: ; preds = %bb38 - %tmp43 = getelementptr inbounds [9 x %struct.hoge], ptr @global, i64 0, i64 0, i32 3, i64 0 - br label %bb44 - -bb44: ; preds = %bb51, %bb42 - %tmp45 = phi i32 [ 0, %bb42 ], [ %tmp52, %bb51 ] - %tmp46 = phi ptr [ %tmp43, %bb42 ], [ undef, %bb51 ] - %tmp47 = load i16, ptr %tmp46, align 2, !tbaa !5 - br label %bb48 - -bb48: ; preds = %bb48, %bb44 - %tmp49 = phi i32 [ 0, %bb44 ], [ %tmp50, %bb48 ] - %tmp50 = add nuw nsw i32 %tmp49, 1 - br i1 false, label %bb48, label %bb51 - -bb51: ; preds = %bb48 - %tmp52 = add nuw nsw i32 %tmp45, 1 - %tmp53 = icmp ne i32 %tmp52, 13 - br i1 %tmp53, label %bb44, label %bb54 - -bb54: ; preds = %bb51 - br label %bb55 - -bb55: ; preds = %bb55, %bb54 - br i1 undef, label %bb55, label %bb56 - -bb56: ; preds = %bb55 - br i1 undef, label %bb26, label %bb57 - -bb57: ; preds = %bb60, %bb56 - br label %bb58 - -bb58: ; preds = %bb58, %bb57 - br i1 undef, label 
%bb58, label %bb59 - -bb59: ; preds = %bb59, %bb58 - br i1 undef, label %bb59, label %bb60 - -bb60: ; preds = %bb59 - br i1 undef, label %bb57, label %bb61 - -bb61: ; preds = %bb65, %bb60 - br label %bb62 - -bb62: ; preds = %bb64, %bb61 - br label %bb63 - -bb63: ; preds = %bb63, %bb62 - br i1 undef, label %bb63, label %bb64 - -bb64: ; preds = %bb63 - br i1 undef, label %bb62, label %bb65 - -bb65: ; preds = %bb64 - br i1 undef, label %bb61, label %bb66 - -bb66: ; preds = %bb70, %bb65 - br label %bb67 - -bb67: ; preds = %bb69, %bb66 - br label %bb68 - -bb68: ; preds = %bb68, %bb67 - br i1 undef, label %bb68, label %bb69 - -bb69: ; preds = %bb68 - br i1 undef, label %bb67, label %bb70 - -bb70: ; preds = %bb69 - br i1 undef, label %bb66, label %bb71 - -bb71: ; preds = %bb73, %bb70 - br label %bb72 - -bb72: ; preds = %bb72, %bb71 - br i1 undef, label %bb72, label %bb73 - -bb73: ; preds = %bb72 - br i1 undef, label %bb71, label %bb74 - -bb74: ; preds = %bb80, %bb73 - br label %bb75 - -bb75: ; preds = %bb79, %bb74 - br label %bb76 - -bb76: ; preds = %bb78, %bb75 - br label %bb77 - -bb77: ; preds = %bb77, %bb76 - br i1 undef, label %bb77, label %bb78 - -bb78: ; preds = %bb77 - br i1 undef, label %bb76, label %bb79 - -bb79: ; preds = %bb78 - br i1 undef, label %bb75, label %bb80 - -bb80: ; preds = %bb79 - br i1 undef, label %bb74, label %bb81 - -bb81: ; preds = %bb85, %bb80 - br label %bb82 - -bb82: ; preds = %bb84, %bb81 - br label %bb83 - -bb83: ; preds = %bb83, %bb82 - br i1 undef, label %bb83, label %bb84 - -bb84: ; preds = %bb83 - br i1 undef, label %bb82, label %bb85 - -bb85: ; preds = %bb84 - br i1 undef, label %bb81, label %bb86 - -bb86: ; preds = %bb85 - ret void -} - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" 
"stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} - -!0 = !{!"clang version 4.0.0"} -!1 = !{!2, !2, i64 0} -!2 = !{!"int", !3, i64 0} -!3 = !{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} -!5 = !{!6, !6, i64 0} -!6 = !{!"short", !3, i64 0} diff --git a/polly/test/Unit/lit.site.cfg.in b/polly/test/Unit/lit.site.cfg.in --- a/polly/test/Unit/lit.site.cfg.in +++ b/polly/test/Unit/lit.site.cfg.in @@ -11,7 +11,6 @@ config.polly_lib_dir = "@POLLY_LIB_DIR@" config.shlibdir = "@SHLIBDIR@" config.target_triple = "@LLVM_TARGET_TRIPLE@" -config.enable_gpgpu_codegen = "@GPU_CODEGEN@" config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@" config.has_unittests = @POLLY_GTEST_AVAIL@ diff --git a/polly/test/lit.cfg b/polly/test/lit.cfg --- a/polly/test/lit.cfg +++ b/polly/test/lit.cfg @@ -70,6 +70,4 @@ print("Could not find llvm-config in " + config.llvm_tools_dir) exit(42) -if re.search(r'NVPTX', llvm_config_cmd.stdout.read().decode('ascii')): - config.available_features.add('nvptx-registered-target') llvm_config_cmd.wait() diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in --- a/polly/test/lit.site.cfg.in +++ b/polly/test/lit.site.cfg.in @@ -7,7 +7,6 @@ config.polly_obj_root = "@POLLY_BINARY_DIR@" config.polly_lib_dir = "@POLLY_LIB_DIR@" config.target_triple = "@LLVM_TARGET_TRIPLE@" -config.enable_gpgpu_codegen = "@GPU_CODEGEN@" config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@" config.targets_to_build = "@TARGETS_TO_BUILD@" config.extra_paths = "@POLLY_TEST_EXTRA_PATHS@".split(";") @@ -50,9 +49,6 @@ config.substitutions.append(('%loadNPMPolly', commonOpts )) -if config.enable_gpgpu_codegen == 'TRUE' : - config.available_features.add('pollyacc') - import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/polly/tools/CMakeLists.txt b/polly/tools/CMakeLists.txt deleted file mode 100644 
--- a/polly/tools/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -if (CUDA_FOUND OR OpenCL_FOUND) - add_subdirectory(GPURuntime) -endif (CUDA_FOUND OR OpenCL_FOUND) - -set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) diff --git a/polly/tools/GPURuntime/CMakeLists.txt b/polly/tools/GPURuntime/CMakeLists.txt deleted file mode 100644 --- a/polly/tools/GPURuntime/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -set(MODULE TRUE) -set(LLVM_NO_RTTI 1) - -add_polly_library(GPURuntime - GPUJIT.c - ) - -set_target_properties(GPURuntime - PROPERTIES - LINKER_LANGUAGE C - PREFIX "lib" - ) - -set_property(TARGET GPURuntime PROPERTY C_STANDARD 99) - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=default ") -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-sanitize=all ") -endif() diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h deleted file mode 100644 --- a/polly/tools/GPURuntime/GPUJIT.h +++ /dev/null @@ -1,123 +0,0 @@ -/******************************************************************************/ -/* */ -/* Part of the LLVM Project, under the Apache License v2.0 with LLVM */ -/* Exceptions. */ -/* See https://llvm.org/LICENSE.txt for license information. */ -/* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ -/* */ -/******************************************************************************/ -/* */ -/* This file defines GPUJIT. */ -/* */ -/******************************************************************************/ - -#ifndef GPUJIT_H_ -#define GPUJIT_H_ -#include "stddef.h" - -/* - * The following demonstrates how we can use the GPURuntime library to - * execute a GPU kernel. 
- * - * char KernelString[] = "\n\ - * .version 1.4\n\ - * .target sm_10, map_f64_to_f32\n\ - * .entry _Z8myKernelPi (\n\ - * .param .u64 __cudaparm__Z8myKernelPi_data)\n\ - * {\n\ - * .reg .u16 %rh<4>;\n\ - * .reg .u32 %r<5>;\n\ - * .reg .u64 %rd<6>;\n\ - * cvt.u32.u16 %r1, %tid.x;\n\ - * mov.u16 %rh1, %ctaid.x;\n\ - * mov.u16 %rh2, %ntid.x;\n\ - * mul.wide.u16 %r2, %rh1, %rh2;\n\ - * add.u32 %r3, %r1, %r2;\n\ - * ld.param.u64 %rd1, [__cudaparm__Z8myKernelPi_data];\n\ - * cvt.s64.s32 %rd2, %r3;\n\ - * mul.wide.s32 %rd3, %r3, 4;\n\ - * add.u64 %rd4, %rd1, %rd3;\n\ - * st.global.s32 [%rd4+0], %r3;\n\ - * exit;\n\ - * }\n\ - * "; - * - * const char *Entry = "_Z8myKernelPi"; - * - * int main() { - * PollyGPUFunction *Kernel; - * PollyGPUContext *Context; - * PollyGPUDevicePtr *DevArray; - * int *HostData; - * int MemSize; - * - * int GridX = 8; - * int GridY = 8; - * - * int BlockX = 16; - * int BlockY = 16; - * int BlockZ = 1; - * - * MemSize = 256*64*sizeof(int); - * Context = polly_initContext(); - * DevArray = polly_allocateMemoryForDevice(MemSize); - * Kernel = polly_getKernel(KernelString, KernelName); - * - * void *Params[1]; - * void *DevPtr = polly_getDevicePtr(DevArray) - * Params[0] = &DevPtr; - * - * polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params); - * - * polly_copyFromDeviceToHost(HostData, DevData, MemSize); - * polly_freeKernel(Kernel); - * polly_freeDeviceMemory(DevArray); - * polly_freeContext(Context); - * } - * - */ - -typedef enum PollyGPURuntimeT { - RUNTIME_NONE, - RUNTIME_CUDA, - RUNTIME_CL -} PollyGPURuntime; - -typedef struct PollyGPUContextT PollyGPUContext; -typedef struct PollyGPUFunctionT PollyGPUFunction; -typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; - -typedef struct OpenCLContextT OpenCLContext; -typedef struct OpenCLKernelT OpenCLKernel; -typedef struct OpenCLDevicePtrT OpenCLDevicePtr; - -typedef struct CUDAContextT CUDAContext; -typedef struct CUDAKernelT CUDAKernel; -typedef struct CUDADevicePtrT 
CUDADevicePtr; - -PollyGPUContext *polly_initContextCUDA(); -PollyGPUContext *polly_initContextCL(); -PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, - const char *KernelName); -void polly_freeKernel(PollyGPUFunction *Kernel); -void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize); -void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize); -void polly_synchronizeDevice(); -void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockSizeX, - unsigned int BlockSizeY, unsigned int BlockSizeZ, - void **Parameters); -void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation); -void polly_freeContext(PollyGPUContext *Context); - -// Note that polly_{malloc/free}Managed are currently not used by Polly. -// We use them in COSMO by replacing all malloc with polly_mallocManaged and all -// frees with cudaFree, so we can get managed memory "automatically". -// Needless to say, this is a hack. -// Please make sure that this code is not present in Polly when 2018 rolls in. -// If this is still present, ping Siddharth Bhat -void *polly_mallocManaged(size_t size); -void polly_freeManaged(void *mem); -#endif /* GPUJIT_H_ */ diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c deleted file mode 100644 --- a/polly/tools/GPURuntime/GPUJIT.c +++ /dev/null @@ -1,1856 +0,0 @@ -/******************** GPUJIT.c - GPUJIT Execution Engine **********************/ -/* */ -/* Part of the LLVM Project, under the Apache License v2.0 with LLVM */ -/* Exceptions. */ -/* See https://llvm.org/LICENSE.txt for license information. */ -/* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ -/* */ -/******************************************************************************/ -/* */ -/* This file implements GPUJIT, a ptx string execution engine for GPU. 
*/ -/* */ -/******************************************************************************/ - -#include "GPUJIT.h" - -#ifdef HAS_LIBCUDART -#include -#include -#endif /* HAS_LIBCUDART */ - -#ifdef HAS_LIBOPENCL -#ifdef __APPLE__ -#include -#else -#include -#endif /* __APPLE__ */ -#endif /* HAS_LIBOPENCL */ - -#include -#include -#include -#include -#include -#include -#include - -static int DebugMode; -static int CacheMode; -#define max(x, y) ((x) > (y) ? (x) : (y)) - -static PollyGPURuntime Runtime = RUNTIME_NONE; - -static void debug_print(const char *format, ...) { - if (!DebugMode) - return; - - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); -} -#define dump_function() debug_print("-> %s\n", __func__) - -#define KERNEL_CACHE_SIZE 10 - -static void err_runtime() __attribute__((noreturn)); -static void err_runtime() { - fprintf(stderr, "Runtime not correctly initialized.\n"); - exit(-1); -} - -struct PollyGPUContextT { - void *Context; -}; - -struct PollyGPUFunctionT { - void *Kernel; -}; - -struct PollyGPUDevicePtrT { - void *DevicePtr; -}; - -/******************************************************************************/ -/* OpenCL */ -/******************************************************************************/ -#ifdef HAS_LIBOPENCL - -struct OpenCLContextT { - cl_context Context; - cl_command_queue CommandQueue; -}; - -struct OpenCLKernelT { - cl_kernel Kernel; - cl_program Program; - const char *BinaryString; -}; - -struct OpenCLDevicePtrT { - cl_mem MemObj; -}; - -/* Dynamic library handles for the OpenCL runtime library. */ -static void *HandleOpenCL; -static void *HandleOpenCLBeignet; - -/* Type-defines of function pointer to OpenCL Runtime API. 
*/ -typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, - cl_platform_id *Platforms, - cl_uint *NumPlatforms); -static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; - -typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform, - cl_device_type DeviceType, - cl_uint NumEntries, cl_device_id *Devices, - cl_uint *NumDevices); -static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; - -typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device, - cl_device_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; - -typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; - -typedef cl_context clCreateContextFcnTy( - const cl_context_properties *Properties, cl_uint NumDevices, - const cl_device_id *Devices, - void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo, - size_t CB, void *UserData), - void *UserData, cl_int *ErrcodeRet); -static clCreateContextFcnTy *clCreateContextFcnPtr; - -typedef cl_command_queue -clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device, - cl_command_queue_properties Properties, - cl_int *ErrcodeRet); -static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; - -typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags, - size_t Size, void *HostPtr, - cl_int *ErrcodeRet); -static clCreateBufferFcnTy *clCreateBufferFcnPtr; - -typedef cl_int -clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer, - cl_bool BlockingWrite, size_t Offset, size_t Size, - const void *Ptr, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; - -typedef cl_program -clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices, - const cl_device_id *DeviceList, - const char 
*Filename, cl_int *ErrcodeRet); -static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr; - -typedef cl_program clCreateProgramWithBinaryFcnTy( - cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, - const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, - cl_int *ErrcodeRet); -static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; - -typedef cl_int clBuildProgramFcnTy( - cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList, - const char *Options, - void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData), - void *UserData); -static clBuildProgramFcnTy *clBuildProgramFcnPtr; - -typedef cl_kernel clCreateKernelFcnTy(cl_program Program, - const char *KernelName, - cl_int *ErrcodeRet); -static clCreateKernelFcnTy *clCreateKernelFcnPtr; - -typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex, - size_t ArgSize, const void *ArgValue); -static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; - -typedef cl_int clEnqueueNDRangeKernelFcnTy( - cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim, - const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; - -typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue, - cl_mem Buffer, cl_bool BlockingRead, - size_t Offset, size_t Size, void *Ptr, - cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, - cl_event *Event); -static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr; - -typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue); -static clFlushFcnTy *clFlushFcnPtr; - -typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue); -static clFinishFcnTy *clFinishFcnPtr; - -typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel); -static clReleaseKernelFcnTy *clReleaseKernelFcnPtr; - -typedef 
cl_int clReleaseProgramFcnTy(cl_program Program); -static clReleaseProgramFcnTy *clReleaseProgramFcnPtr; - -typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject); -static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr; - -typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue); -static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr; - -typedef cl_int clReleaseContextFcnTy(cl_context Context); -static clReleaseContextFcnTy *clReleaseContextFcnPtr; - -static void *getAPIHandleCL(void *Handle, const char *FuncName) { - char *Err; - void *FuncPtr; - dlerror(); - FuncPtr = dlsym(Handle, FuncName); - if ((Err = dlerror()) != 0) { - fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err); - return 0; - } - return FuncPtr; -} - -static int initialDeviceAPILibrariesCL() { - HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY); - HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); - if (!HandleOpenCL) { - fprintf(stderr, "Cannot open library: %s. \n", dlerror()); - return 0; - } - return 1; -} - -/* Get function pointer to OpenCL Runtime API. - * - * Note that compilers conforming to the ISO C standard are required to - * generate a warning if a conversion from a void * pointer to a function - * pointer is attempted as in the following statements. The warning - * of this kind of cast may not be emitted by clang and new versions of gcc - * as it is valid on POSIX 2008. For compilers required to generate a warning, - * we temporarily disable -Wpedantic, to avoid bloating the output with - * unnecessary warnings. 
- * - * Reference: - * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html - */ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wpedantic" -static int initialDeviceAPIsCL() { - if (initialDeviceAPILibrariesCL() == 0) - return 0; - - // FIXME: We are now always selecting the Intel Beignet driver if it is - // available on the system, instead of a possible NVIDIA or AMD OpenCL - // API. This selection should occurr based on the target architecture - // chosen when compiling. - void *Handle = - (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL); - - clGetPlatformIDsFcnPtr = - (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs"); - - clGetDeviceIDsFcnPtr = - (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs"); - - clGetDeviceInfoFcnPtr = - (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo"); - - clGetKernelInfoFcnPtr = - (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo"); - - clCreateContextFcnPtr = - (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext"); - - clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( - Handle, "clCreateCommandQueue"); - - clCreateBufferFcnPtr = - (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer"); - - clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( - Handle, "clEnqueueWriteBuffer"); - - if (HandleOpenCLBeignet) - clCreateProgramWithLLVMIntelFcnPtr = - (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL( - Handle, "clCreateProgramWithLLVMIntel"); - - clCreateProgramWithBinaryFcnPtr = - (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( - Handle, "clCreateProgramWithBinary"); - - clBuildProgramFcnPtr = - (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram"); - - clCreateKernelFcnPtr = - (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel"); - - clSetKernelArgFcnPtr = - (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg"); - - 
clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( - Handle, "clEnqueueNDRangeKernel"); - - clEnqueueReadBufferFcnPtr = - (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer"); - - clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush"); - - clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish"); - - clReleaseKernelFcnPtr = - (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel"); - - clReleaseProgramFcnPtr = - (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram"); - - clReleaseMemObjectFcnPtr = - (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject"); - - clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( - Handle, "clReleaseCommandQueue"); - - clReleaseContextFcnPtr = - (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext"); - - return 1; -} -#pragma GCC diagnostic pop - -/* Context and Device. */ -static PollyGPUContext *GlobalContext = NULL; -static cl_device_id GlobalDeviceID = NULL; - -/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ -static void printOpenCLError(int Error); - -static void checkOpenCLError(int Ret, const char *format, ...) { - if (Ret == CL_SUCCESS) - return; - - printOpenCLError(Ret); - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - exit(-1); -} - -static PollyGPUContext *initContextCL() { - dump_function(); - - PollyGPUContext *Context; - - cl_platform_id PlatformID = NULL; - cl_device_id DeviceID = NULL; - cl_uint NumDevicesRet; - cl_int Ret; - - char DeviceRevision[256]; - char DeviceName[256]; - size_t DeviceRevisionRetSize, DeviceNameRetSize; - - static __thread PollyGPUContext *CurrentContext = NULL; - - if (CurrentContext) - return CurrentContext; - - /* Get API handles. 
*/ - if (initialDeviceAPIsCL() == 0) { - fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); - exit(-1); - } - - /* Get number of devices that support OpenCL. */ - static const int NumberOfPlatforms = 1; - Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL); - checkOpenCLError(Ret, "Failed to get platform IDs.\n"); - // TODO: Extend to CL_DEVICE_TYPE_ALL? - static const int NumberOfDevices = 1; - Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices, - &DeviceID, &NumDevicesRet); - checkOpenCLError(Ret, "Failed to get device IDs.\n"); - - GlobalDeviceID = DeviceID; - if (NumDevicesRet == 0) { - fprintf(stderr, "There is no device supporting OpenCL.\n"); - exit(-1); - } - - /* Get device revision. */ - Ret = - clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision), - DeviceRevision, &DeviceRevisionRetSize); - checkOpenCLError(Ret, "Failed to fetch device revision.\n"); - - /* Get device name. */ - Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName), - DeviceName, &DeviceNameRetSize); - checkOpenCLError(Ret, "Failed to fetch device name.\n"); - - debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); - - /* Create context on the device. 
*/ - Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); - if (Context == 0) { - fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); - exit(-1); - } - Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext)); - if (Context->Context == 0) { - fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n"); - exit(-1); - } - ((OpenCLContext *)Context->Context)->Context = - clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret); - checkOpenCLError(Ret, "Failed to create context.\n"); - - static const int ExtraProperties = 0; - ((OpenCLContext *)Context->Context)->CommandQueue = - clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context, - DeviceID, ExtraProperties, &Ret); - checkOpenCLError(Ret, "Failed to create command queue.\n"); - - if (CacheMode) - CurrentContext = Context; - - GlobalContext = Context; - return Context; -} - -static void freeKernelCL(PollyGPUFunction *Kernel) { - dump_function(); - - if (CacheMode) - return; - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); - checkOpenCLError(Ret, "Failed to flush command queue.\n"); - Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); - checkOpenCLError(Ret, "Failed to finish command queue.\n"); - - if (((OpenCLKernel *)Kernel->Kernel)->Kernel) { - cl_int Ret = - clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel); - checkOpenCLError(Ret, "Failed to release kernel.\n"); - } - - if (((OpenCLKernel *)Kernel->Kernel)->Program) { - cl_int Ret = - clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program); - checkOpenCLError(Ret, "Failed to release program.\n"); - } - - if (Kernel->Kernel) - free((OpenCLKernel *)Kernel->Kernel); - - if (Kernel) - free(Kernel); -} - -static PollyGPUFunction *getKernelCL(const char *BinaryBuffer, - const char 
*KernelName) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; - static __thread int NextCacheItem = 0; - - for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { - // We exploit here the property that all Polly-ACC kernels are allocated - // as global constants, hence a pointer comparision is sufficient to - // determin equality. - if (KernelCache[i] && - ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString == - BinaryBuffer) { - debug_print(" -> using cached kernel\n"); - return KernelCache[i]; - } - } - - PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); - if (Function == 0) { - fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); - exit(-1); - } - Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel)); - if (Function->Kernel == 0) { - fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n"); - exit(-1); - } - - if (!GlobalDeviceID) { - fprintf(stderr, "GPGPU-code generation not initialized correctly.\n"); - ex