diff --git a/polly/CMakeLists.txt b/polly/CMakeLists.txt
--- a/polly/CMakeLists.txt
+++ b/polly/CMakeLists.txt
@@ -85,31 +85,6 @@
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
-set(GPU_CODEGEN FALSE)
-if (POLLY_ENABLE_GPGPU_CODEGEN)
-  # Do not require CUDA/OpenCL, as GPU code generation test cases can be run
-  # without a CUDA/OpenCL library.
-  if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
-    FIND_PACKAGE(CUDA)
-    FIND_PACKAGE(OpenCL)
-    set(GPU_CODEGEN TRUE)
-  else()
-    message(WARNING "The LLVM NVPTX target is required for GPU code generation")
-  endif()
-endif(POLLY_ENABLE_GPGPU_CODEGEN)
-
-
-# Support GPGPU code generation if the library is available.
-if (CUDA_FOUND)
-  add_definitions(-DHAS_LIBCUDART)
-  INCLUDE_DIRECTORIES( ${CUDA_INCLUDE_DIRS} )
-endif(CUDA_FOUND)
-if (OpenCL_FOUND)
-  add_definitions(-DHAS_LIBOPENCL)
-  INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
-endif(OpenCL_FOUND)
-
 option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
 if (NOT POLLY_BUNDLED_ISL)
   find_package(ISL MODULE REQUIRED)
@@ -155,7 +130,6 @@
 if (POLLY_GTEST_AVAIL)
   add_subdirectory(unittests)
 endif ()
-add_subdirectory(tools)
 add_subdirectory(cmake)
 
 # TODO: docs.
diff --git a/polly/cmake/CMakeLists.txt b/polly/cmake/CMakeLists.txt
--- a/polly/cmake/CMakeLists.txt
+++ b/polly/cmake/CMakeLists.txt
@@ -27,9 +27,6 @@
   # LLVMPolly is a dummy target on Win or if PIC code is disabled.
   list(APPEND POLLY_CONFIG_EXPORTED_TARGETS LLVMPolly)
 endif()
-if (POLLY_ENABLE_GPGPU_CODEGEN)
-  list(APPEND POLLY_CONFIG_EXPORTED_TARGETS PollyPPCG)
-endif()
 
 # Get the target type for every exported target
 foreach(tgt IN LISTS POLLY_CONFIG_EXPORTED_TARGETS)
diff --git a/polly/cmake/PollyConfig.cmake.in b/polly/cmake/PollyConfig.cmake.in
--- a/polly/cmake/PollyConfig.cmake.in
+++ b/polly/cmake/PollyConfig.cmake.in
@@ -8,7 +8,6 @@
 set(Polly_CMAKE_DIR ${CMAKE_CURRENT_LIST_DIR})
 
 set(Polly_BUNDLED_ISL @POLLY_BUNDLED_ISL@)
-set(Polly_ENABLE_GPGPU_CODEGEN @POLLY_ENABLE_GPGPU_CODEGEN@)
 set(Polly_DEFINITIONS ${LLVM_DEFINITIONS})
 set(Polly_INCLUDE_DIRS @POLLY_CONFIG_INCLUDE_DIRS@ ${LLVM_INCLUDE_DIRS})
@@ -19,17 +18,9 @@
 # Imported Targets:
 @ISL_CONFIG_CODE@
 
-if (Polly_ENABLE_GPGPU_CODEGEN AND NOT TARGET PollyPPCG)
-  add_library(PollyPPCG @POLLY_CONFIG_TARGET_PollyPPCG_TYPE@ IMPORTED)
-  set_property(TARGET PollyPPCG PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@)
-endif()
-
 if (NOT TARGET Polly)
   add_library(Polly @POLLY_CONFIG_TARGET_Polly_TYPE@ IMPORTED)
   set_property(TARGET Polly PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@)
-  if (Polly_ENABLE_GPGPU_CODEGEN)
-    set_property(TARGET Polly APPEND PROPERTY INTERFACE_LINK_LIBRARIES PollyPPCG)
-  endif()
 endif()
 
 if (NOT TARGET LLVMPolly)
diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst
--- a/polly/docs/ReleaseNotes.rst
+++ b/polly/docs/ReleaseNotes.rst
@@ -21,3 +21,5 @@
 In the future we hope that Polly can collaborate better with LoopVectorize,
 like Polly marking a loop as safe to vectorize with a specific simd width,
 instead of replicating its functionality.
+
+- Polly-ACC has been removed.
diff --git a/polly/include/polly/CodeGen/PPCGCodeGeneration.h b/polly/include/polly/CodeGen/PPCGCodeGeneration.h
deleted file mode 100644
--- a/polly/include/polly/CodeGen/PPCGCodeGeneration.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Take a scop created by ScopInfo and map it to GPU code using the ppcg
-// GPU mapping strategy.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef POLLY_PPCGCODEGENERATION_H
-#define POLLY_PPCGCODEGENERATION_H
-
-/// The GPU Architecture to target.
-enum GPUArch { NVPTX64, SPIR32, SPIR64 };
-
-/// The GPU Runtime implementation to use.
-enum GPURuntime { CUDA, OpenCL };
-
-namespace polly {
-extern bool PollyManagedMemory;
-
-/// Use for pass instantiation defaults.
-/// @{
-extern GPURuntime GPURuntimeChoice;
-extern GPUArch GPUArchChoice;
-/// @}
-} // namespace polly
-
-#endif // POLLY_PPCGCODEGENERATION_H
diff --git a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h
--- a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h
+++ b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h
@@ -30,24 +30,20 @@
 struct RuntimeDebugBuilder {
   /// Generate a constant string into the builder's llvm::Module which can be
-  /// passed to createGPUPrinter() or createGPUPrinter().
+  /// passed to createCPUPrinter().
   ///
   /// @param Builder The builder used to emit the printer calls.
   /// @param Str     The string to be printed.
   /// @return        A global containing @p Str.
   static llvm::Value *getPrintableString(PollyIRBuilder &Builder,
-                                         llvm::StringRef Str) {
-    // TODO: Get rid of magic number 4. It it NVPTX's constant address space and
-    // works on X86 (CPU) only because its backend ignores the address space.
-    return Builder.CreateGlobalStringPtr(Str, "", 4);
-  }
+                                         llvm::StringRef Str);
 
   /// Return whether an llvm::Value of the type @p Ty is printable for
   /// debugging.
   ///
-  /// That is, whether such a value can be passed to createGPUPrinter() or
-  /// createGPUPrinter() to be dumped as runtime. If false is returned, those
+  /// That is, whether such a value can be passed to createCPUPrinter()
+  /// to be dumped at runtime. If false is returned, those
   /// functions will fail.
   static bool isPrintable(llvm::Type *Ty);
@@ -64,62 +60,41 @@
   template <typename... Args>
   static void createCPUPrinter(PollyIRBuilder &Builder, Args... args) {
     std::vector<llvm::Value *> Vector;
-    createPrinter(Builder, /* CPU */ false, Vector, args...);
-  }
-
-  /// Print a set of LLVM-IR Values or StringRefs on an NVIDIA GPU.
-  ///
-  /// This function emits a call to vprintf that will print the given
-  /// arguments from within a kernel thread. It is useful for debugging
-  /// CUDA program kernels. All arguments given in this list will be
-  /// automatically concatenated and the resulting string will be printed
-  /// atomically. We also support ArrayRef arguments, which can be used to
-  /// provide for example a list of thread-id values.
-  ///
-  /// @param Builder The builder used to emit the printer calls.
-  /// @param Args    The list of values to print.
-  template <typename... Args>
-  static void createGPUPrinter(PollyIRBuilder &Builder, Args... args) {
-    std::vector<llvm::Value *> Vector;
-    createPrinter(Builder, /* GPU */ true, Vector, args...);
+    createPrinter(Builder, Vector, args...);
   }
 
 private:
   /// Handle Values.
   template <typename... Args>
-  static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
+  static void createPrinter(PollyIRBuilder &Builder,
                             std::vector<llvm::Value *> &Values,
                             llvm::Value *Value, Args... args) {
     Values.push_back(Value);
-    createPrinter(Builder, UseGPU, Values, args...);
+    createPrinter(Builder, Values, args...);
   }
 
   /// Handle StringRefs.
   template <typename... Args>
-  static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
+  static void createPrinter(PollyIRBuilder &Builder,
                             std::vector<llvm::Value *> &Values,
                             llvm::StringRef String, Args... args) {
     Values.push_back(getPrintableString(Builder, String));
-    createPrinter(Builder, UseGPU, Values, args...);
+    createPrinter(Builder, Values, args...);
   }
 
   /// Handle ArrayRefs.
   template <typename... Args>
-  static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
+  static void createPrinter(PollyIRBuilder &Builder,
                             std::vector<llvm::Value *> &Values,
                             llvm::ArrayRef<llvm::Value *> Array, Args... args) {
     Values.insert(Values.end(), Array.begin(), Array.end());
-    createPrinter(Builder, UseGPU, Values, args...);
+    createPrinter(Builder, Values, args...);
   }
 
   /// Print a list of Values.
-  static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
+  static void createPrinter(PollyIRBuilder &Builder,
                             llvm::ArrayRef<llvm::Value *> Values);
 
-  /// Print a list of Values on a GPU.
-  static void createGPUPrinterT(PollyIRBuilder &Builder,
-                                llvm::ArrayRef<llvm::Value *> Values);
-
   /// Print a list of Values on a CPU.
   static void createCPUPrinterT(PollyIRBuilder &Builder,
                                 llvm::ArrayRef<llvm::Value *> Values);
@@ -145,22 +120,6 @@
   ///
   /// @param Builder The builder used to insert the code.
   static void createFlush(PollyIRBuilder &Builder);
-
-  /// Get (and possibly insert) a NVIDIA address space cast call.
-  static llvm::Function *getAddressSpaceCast(PollyIRBuilder &Builder,
-                                             unsigned Src, unsigned Dst,
-                                             unsigned SrcBits = 8,
-                                             unsigned DstBits = 8);
-
-  /// Get identifiers that describe the currently executed GPU thread.
-  ///
-  /// The result will be a vector that if passed to the GPU printer will result
-  /// into a string (initialized to values corresponding to the printing
-  /// thread):
-  ///
-  ///   "> block-id: bidx bid1y bidz | thread-id: tidx tidy tidz "
-  static std::vector<llvm::Value *>
-  getGPUThreadIdentifiers(PollyIRBuilder &Builder);
 };
 
 } // namespace polly
diff --git a/polly/include/polly/Config/config.h.cmake b/polly/include/polly/Config/config.h.cmake
--- a/polly/include/polly/Config/config.h.cmake
+++ b/polly/include/polly/Config/config.h.cmake
@@ -12,7 +12,4 @@
 #ifndef POLLY_CONFIG_H
 #define POLLY_CONFIG_H
 
-#cmakedefine CUDA_FOUND
-#cmakedefine GPU_CODEGEN
-
 #endif
diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h
--- a/polly/include/polly/LinkAllPasses.h
+++ b/polly/include/polly/LinkAllPasses.h
@@ -14,7 +14,6 @@
 #ifndef POLLY_LINKALLPASSES_H
 #define POLLY_LINKALLPASSES_H
 
-#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/Config/config.h"
 #include "polly/Support/DumpFunctionPass.h"
 #include "polly/Support/DumpModulePass.h"
@@ -54,14 +53,6 @@
 llvm::Pass *createIslAstInfoWrapperPassPass();
 llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS);
 llvm::Pass *createCodeGenerationPass();
-#ifdef GPU_CODEGEN
-llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
-                                         GPURuntime Runtime = GPURuntime::CUDA);
-
-llvm::Pass *
-createManagedMemoryRewritePassPass(GPUArch Arch = GPUArch::NVPTX64,
-                                   GPURuntime Runtime = GPURuntime::CUDA);
-#endif
 llvm::Pass *createIslScheduleOptimizerWrapperPass();
 llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS);
 llvm::Pass *createFlattenSchedulePass();
@@ -113,10 +104,6 @@
     polly::createIslAstInfoWrapperPassPass();
     polly::createIslAstInfoPrinterLegacyPass(llvm::outs());
     polly::createCodeGenerationPass();
-#ifdef GPU_CODEGEN
-    polly::createPPCGCodeGenerationPass();
-    polly::createManagedMemoryRewritePassPass();
-#endif
     polly::createIslScheduleOptimizerWrapperPass();
     polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs());
     polly::createMaximalStaticExpansionPass();
@@ -156,10 +143,6 @@
 void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &);
 void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &);
 void initializeCodeGenerationPass(llvm::PassRegistry &);
-#ifdef GPU_CODEGEN
-void initializePPCGCodeGenerationPass(llvm::PassRegistry &);
-void initializeManagedMemoryRewritePassPass(llvm::PassRegistry &);
-#endif
 void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &);
 void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &);
 void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &);
diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h
--- a/polly/include/polly/ScopInfo.h
+++ b/polly/include/polly/ScopInfo.h
@@ -1684,9 +1684,6 @@
   /// Number of copy statements.
   unsigned CopyStmtsNum = 0;
 
-  /// Flag to indicate if the Scop is to be skipped.
-  bool SkipScop = false;
-
   using StmtSet = std::list<ScopStmt>;
 
   /// The statements in this Scop.
@@ -2144,12 +2141,6 @@
   /// Check if the SCoP has been optimized by the scheduler.
   bool isOptimized() const { return IsOptimized; }
 
-  /// Mark the SCoP to be skipped by ScopPass passes.
-  void markAsToBeSkipped() { SkipScop = true; }
-
-  /// Check if the SCoP is to be skipped by ScopPass passes.
-  bool isToBeSkipped() const { return SkipScop; }
-
   /// Return the ID of the Scop
   int getID() const { return ID; }
diff --git a/polly/include/polly/Support/LinkGPURuntime.h b/polly/include/polly/Support/LinkGPURuntime.h
deleted file mode 100644
--- a/polly/include/polly/Support/LinkGPURuntime.h
+++ /dev/null
@@ -1,42 +0,0 @@
-//===- Support/LinkGPURuntime.h -- Headerfile to help force-link GPURuntime =//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This header helps pull in libGPURuntime.so
-//
-//===----------------------------------------------------------------------===//
-#ifndef POLLY_LINK_GPURUNTIME
-#define POLLY_LINK_GPURUNTIME
-
-extern "C" {
-#include "GPURuntime/GPUJIT.h"
-}
-
-namespace polly {
-struct ForceGPURuntimeLinking {
-  ForceGPURuntimeLinking() {
-    if (std::getenv("bar") != (char *)-1)
-      return;
-    // We must reference GPURuntime in such a way that compilers will not
-    // delete it all as dead code, even with whole program optimization,
-    // yet is effectively a NO-OP. As the compiler isn't smart enough
-    // to know that getenv() never returns -1, this will do the job.
-    polly_initContextCL();
-    polly_initContextCUDA();
-    polly_getKernel(nullptr, nullptr);
-    polly_freeKernel(nullptr);
-    polly_copyFromHostToDevice(nullptr, nullptr, 0);
-    polly_copyFromDeviceToHost(nullptr, nullptr, 0);
-    polly_synchronizeDevice();
-    polly_launchKernel(nullptr, 0, 0, 0, 0, 0, nullptr);
-    polly_freeDeviceMemory(nullptr);
-    polly_freeContext(nullptr);
-    polly_synchronizeDevice();
-  }
-} structure;
-} // namespace polly
-#endif
diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt
--- a/polly/lib/CMakeLists.txt
+++ b/polly/lib/CMakeLists.txt
@@ -6,13 +6,6 @@
   CodeGen/IslNodeBuilder.cpp
   CodeGen/CodeGeneration.cpp)
 
-if (GPU_CODEGEN)
-  set (GPGPU_CODEGEN_FILES
-    CodeGen/PPCGCodeGeneration.cpp
-    CodeGen/ManagedMemoryRewrite.cpp
-    )
-endif (GPU_CODEGEN)
-
 # Compile ISL into a separate library.
 add_subdirectory(External)
 
@@ -44,12 +37,6 @@
   Vectorize
 )
 
-# Polly-ACC requires the NVPTX backend to work. Ask LLVM about its libraries.
-if (GPU_CODEGEN)
-  # This call emits an error if they NVPTX backend is not enable.
-  list(APPEND POLLY_COMPONENTS NVPTX)
-endif ()
-
 # Use an object-library to add the same files to multiple libs without requiring
 # the sources to be recompiled for each of them.
 add_llvm_pass_plugin(Polly
@@ -73,7 +60,6 @@
   CodeGen/Utils.cpp
   CodeGen/RuntimeDebugBuilder.cpp
   CodeGen/PerfMonitor.cpp
-  ${GPGPU_CODEGEN_FILES}
   Exchange/JSONExporter.cpp
   Support/GICHelper.cpp
   Support/SCEVAffinator.cpp
@@ -127,16 +113,6 @@
   ${ISL_TARGET}
 )
 
-# Additional dependencies for Polly-ACC.
-if (GPU_CODEGEN)
-  target_link_libraries(Polly PUBLIC PollyPPCG)
-endif ()
-
-if (NOT LLVM_LINK_LLVM_DYLIB AND NOT LLVM_POLLY_LINK_INTO_TOOLS)
-  # Polly-ACC requires the NVPTX target to be present in the executable it is linked to
-  set_property(TARGET bugpoint APPEND PROPERTY LINK_LIBRARIES LLVMTarget)
-endif ()
-
 # Create a loadable module Polly.so that can be loaded using
 # LLVM's/clang's "-load" option.
 if (WIN32 OR NOT LLVM_ENABLE_PIC)
@@ -150,19 +126,6 @@
     $
   )
 
-  # Only add the dependencies that are not part of LLVM. The latter are assumed
-  # to be already available in the address space the module is loaded into.
-  # Adding them once more would have the effect that both copies try to register
-  # the same command line options, to which LLVM reacts with an error.
-  # If Polly-ACC is enabled, the NVPTX target is also expected to reside in the
-  # hosts. This is not the case for bugpoint. Use LLVM_POLLY_LINK_INTO_TOOLS=ON
-  # instead which will automatically resolve the additional dependencies by
-  # Polly.
-  target_link_libraries(LLVMPolly PUBLIC ${ISL_TARGET})
-  if (GPU_CODEGEN)
-    target_link_libraries(LLVMPolly PUBLIC PollyPPCG)
-  endif ()
-
   set_target_properties(LLVMPolly
     PROPERTIES
     LINKER_LANGUAGE CXX
diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp
--- a/polly/lib/CodeGen/BlockGenerators.cpp
+++ b/polly/lib/CodeGen/BlockGenerators.cpp
@@ -238,14 +238,8 @@
   Builder.Insert(NewInst);
   BBMap[Inst] = NewInst;
 
-  // When copying the instruction onto the Module meant for the GPU,
-  // debug metadata attached to an instruction causes all related
-  // metadata to be pulled into the Module. This includes the DICompileUnit,
-  // which will not be listed in llvm.dbg.cu of the Module since the Module
-  // doesn't contain one. This fails the verification of the Module and the
-  // subsequent generation of the ASM string.
- if (NewInst->getModule() != Inst->getModule()) - NewInst->setDebugLoc(llvm::DebugLoc()); + assert(NewInst->getModule() == Inst->getModule() && + "Expecting instructions to be in the same module"); if (!NewInst->getType()->isVoidTy()) NewInst->setName("p_" + Inst->getName()); diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp --- a/polly/lib/CodeGen/CodeGeneration.cpp +++ b/polly/lib/CodeGen/CodeGeneration.cpp @@ -323,10 +323,6 @@ /// Generate LLVM-IR for the SCoP @p S. bool runOnScop(Scop &S) override { - // Skip SCoPs in case they're already code-generated by PPCGCodeGeneration. - if (S.isToBeSkipped()) - return false; - AI = &getAnalysis().getAI(); LI = &getAnalysis().getLoopInfo(); DT = &getAnalysis().getDomTree(); diff --git a/polly/lib/CodeGen/IslAst.cpp b/polly/lib/CodeGen/IslAst.cpp --- a/polly/lib/CodeGen/IslAst.cpp +++ b/polly/lib/CodeGen/IslAst.cpp @@ -638,10 +638,6 @@ static std::unique_ptr runIslAst( Scop &Scop, function_ref GetDeps) { - // Skip SCoPs in case they're already handled by PPCGCodeGeneration. - if (Scop.isToBeSkipped()) - return {}; - ScopsProcessed++; const Dependences &D = GetDeps(Dependences::AL_Statement); diff --git a/polly/lib/CodeGen/ManagedMemoryRewrite.cpp b/polly/lib/CodeGen/ManagedMemoryRewrite.cpp deleted file mode 100644 --- a/polly/lib/CodeGen/ManagedMemoryRewrite.cpp +++ /dev/null @@ -1,427 +0,0 @@ -//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Take a module and rewrite: -// 1. `malloc` -> `polly_mallocManaged` -// 2. `free` -> `polly_freeManaged` -// 3. global arrays with initializers -> global arrays that are initialized -// with a constructor call to -// `polly_mallocManaged`. -// -//===----------------------------------------------------------------------===// - -#include "polly/CodeGen/IRBuilder.h" -#include "polly/CodeGen/PPCGCodeGeneration.h" -#include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" -#include "polly/Options.h" -#include "polly/ScopDetection.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/Analysis/CaptureTracking.h" -#include "llvm/InitializePasses.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" - -using namespace llvm; -using namespace polly; - -static cl::opt RewriteAllocas( - "polly-acc-rewrite-allocas", - cl::desc( - "Ask the managed memory rewriter to also rewrite alloca instructions"), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt IgnoreLinkageForGlobals( - "polly-acc-rewrite-ignore-linkage-for-globals", - cl::desc( - "By default, we only rewrite globals with internal linkage. This flag " - "enables rewriting of globals regardless of linkage"), - cl::Hidden, cl::cat(PollyCategory)); - -#define DEBUG_TYPE "polly-acc-rewrite-managed-memory" -namespace { - -static llvm::Function *getOrCreatePollyMallocManaged(Module &M) { - const char *Name = "polly_mallocManaged"; - Function *F = M.getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - PollyIRBuilder Builder(M.getContext()); - // TODO: How do I get `size_t`? I assume from DataLayout? 
- FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), - {Builder.getInt64Ty()}, false); - F = Function::Create(Ty, Linkage, Name, &M); - } - - return F; -} - -static llvm::Function *getOrCreatePollyFreeManaged(Module &M) { - const char *Name = "polly_freeManaged"; - Function *F = M.getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - PollyIRBuilder Builder(M.getContext()); - // TODO: How do I get `size_t`? I assume from DataLayout? - FunctionType *Ty = - FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false); - F = Function::Create(Ty, Linkage, Name, &M); - } - - return F; -} - -// Expand a constant expression `Cur`, which is used at instruction `Parent` -// at index `index`. -// Since a constant expression can expand to multiple instructions, store all -// the expands into a set called `Expands`. -// Note that this goes inorder on the constant expression tree. -// A * ((B * D) + C) -// will be processed with first A, then B * D, then B, then D, and then C. -// Though ConstantExprs are not treated as "trees" but as DAGs, since you can -// have something like this: -// * -// / \ -// \ / -// (D) -// -// For the purposes of this expansion, we expand the two occurences of D -// separately. Therefore, we expand the DAG into the tree: -// * -// / \ -// D D -// TODO: We don't _have_to do this, but this is the simplest solution. -// We can write a solution that keeps track of which constants have been -// already expanded. -static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder, - Instruction *Parent, int index, - SmallPtrSet &Expands) { - assert(Cur && "invalid constant expression passed"); - Instruction *I = Cur->getAsInstruction(); - assert(I && "unable to convert ConstantExpr to Instruction"); - - LLVM_DEBUG(dbgs() << "Expanding ConstantExpression: (" << *Cur - << ") in Instruction: (" << *I << ")\n";); - - // Invalidate `Cur` so that no one after this point uses `Cur`. Rather, - // they should mutate `I`. - Cur = nullptr; - - Expands.insert(I); - Parent->setOperand(index, I); - - // The things that `Parent` uses (its operands) should be created - // before `Parent`. - Builder.SetInsertPoint(Parent); - Builder.Insert(I); - - for (unsigned i = 0; i < I->getNumOperands(); i++) { - Value *Op = I->getOperand(i); - assert(isa(Op) && "constant must have a constant operand"); - - if (ConstantExpr *CExprOp = dyn_cast(Op)) - expandConstantExpr(CExprOp, Builder, I, i, Expands); - } -} - -// Edit all uses of `OldVal` to NewVal` in `Inst`. This will rewrite -// `ConstantExpr`s that are used in the `Inst`. -// Note that `replaceAllUsesWith` is insufficient for this purpose because it -// does not rewrite values in `ConstantExpr`s. -static void rewriteOldValToNew(Instruction *Inst, Value *OldVal, Value *NewVal, - PollyIRBuilder &Builder) { - - // This contains a set of instructions in which OldVal must be replaced. - // We start with `Inst`, and we fill it up with the expanded `ConstantExpr`s - // from `Inst`s arguments. - // We need to go through this process because `replaceAllUsesWith` does not - // actually edit `ConstantExpr`s. - SmallPtrSet InstsToVisit = {Inst}; - - // Expand all `ConstantExpr`s and place it in `InstsToVisit`. 
- for (unsigned i = 0; i < Inst->getNumOperands(); i++) { - Value *Operand = Inst->getOperand(i); - if (ConstantExpr *ValueConstExpr = dyn_cast(Operand)) - expandConstantExpr(ValueConstExpr, Builder, Inst, i, InstsToVisit); - } - - // Now visit each instruction and use `replaceUsesOfWith`. We know that - // will work because `I` cannot have any `ConstantExpr` within it. - for (Instruction *I : InstsToVisit) - I->replaceUsesOfWith(OldVal, NewVal); -} - -// Given a value `Current`, return all Instructions that may contain `Current` -// in an expression. -// We need this auxiliary function, because if we have a -// `Constant` that is a user of `V`, we need to recurse into the -// `Constant`s uses to gather the root instruction. -static void getInstructionUsersOfValue(Value *V, - SmallVector &Owners) { - if (auto *I = dyn_cast(V)) { - Owners.push_back(I); - } else { - // Anything that is a `User` must be a constant or an instruction. - auto *C = cast(V); - for (Use &CUse : C->uses()) - getInstructionUsersOfValue(CUse.getUser(), Owners); - } -} - -static void -replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array, - SmallPtrSet &ReplacedGlobals) { - // We only want arrays. - ArrayType *ArrayTy = dyn_cast(Array.getValueType()); - if (!ArrayTy) - return; - Type *ElemTy = ArrayTy->getElementType(); - PointerType *ElemPtrTy = ElemTy->getPointerTo(); - - // We only wish to replace arrays that are visible in the module they - // inhabit. Otherwise, our type edit from [T] to T* would be illegal across - // modules. - const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() || - Array.hasInternalLinkage() || - IgnoreLinkageForGlobals; - if (!OnlyVisibleInsideModule) { - LLVM_DEBUG( - dbgs() << "Not rewriting (" << Array - << ") to managed memory " - "because it could be visible externally. To force rewrite, " - "use -polly-acc-rewrite-ignore-linkage-for-globals.\n"); - return; - } - - if (!Array.hasInitializer() || - !isa(Array.getInitializer())) { - LLVM_DEBUG(dbgs() << "Not rewriting (" << Array - << ") to managed memory " - "because it has an initializer which is " - "not a zeroinitializer.\n"); - return; - } - - // At this point, we have committed to replacing this array. 
- ReplacedGlobals.insert(&Array); - - std::string NewName = Array.getName().str(); - NewName += ".toptr"; - GlobalVariable *ReplacementToArr = - cast(M.getOrInsertGlobal(NewName, ElemPtrTy)); - ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy)); - - Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M); - std::string FnName = Array.getName().str(); - FnName += ".constructor"; - PollyIRBuilder Builder(M.getContext()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); - const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Function *F = Function::Create(Ty, Linkage, FnName, &M); - BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F); - Builder.SetInsertPoint(Start); - - const uint64_t ArraySizeInt = DL.getTypeAllocSize(ArrayTy); - Value *ArraySize = Builder.getInt64(ArraySizeInt); - ArraySize->setName("array.size"); - - Value *AllocatedMemRaw = - Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw"); - Value *AllocatedMemTyped = - Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed"); - Builder.CreateStore(AllocatedMemTyped, ReplacementToArr); - Builder.CreateRetVoid(); - - const int Priority = 0; - appendToGlobalCtors(M, F, Priority, ReplacementToArr); - - SmallVector ArrayUserInstructions; - // Get all instructions that use array. We need to do this weird thing - // because `Constant`s that contain this array neeed to be expanded into - // instructions so that we can replace their parameters. `Constant`s cannot - // be edited easily, so we choose to convert all `Constant`s to - // `Instruction`s and handle all of the uses of `Array` uniformly. - for (Use &ArrayUse : Array.uses()) - getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions); - - for (Instruction *UserOfArrayInst : ArrayUserInstructions) { - - Builder.SetInsertPoint(UserOfArrayInst); - // ** -> * - Value *ArrPtrLoaded = - Builder.CreateLoad(ElemPtrTy, ReplacementToArr, "arrptr.load"); - // * -> [ty]* - Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast( - ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast"); - rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder); - } -} - -// We return all `allocas` that may need to be converted to a call to -// cudaMallocManaged. 
-static void getAllocasToBeManaged(Function &F, - SmallSet &Allocas) { - for (BasicBlock &BB : F) { - for (Instruction &I : BB) { - auto *Alloca = dyn_cast(&I); - if (!Alloca) - continue; - LLVM_DEBUG(dbgs() << "Checking if (" << *Alloca << ") may be captured: "); - - if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false, - /* StoreCaptures */ true)) { - Allocas.insert(Alloca); - LLVM_DEBUG(dbgs() << "YES (captured).\n"); - } else { - LLVM_DEBUG(dbgs() << "NO (not captured).\n"); - } - } - } -} - -static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca, - const DataLayout &DL) { - LLVM_DEBUG(dbgs() << "rewriting: (" << *Alloca << ") to managed mem.\n"); - Module *M = Alloca->getModule(); - assert(M && "Alloca does not have a module"); - - PollyIRBuilder Builder(M->getContext()); - Builder.SetInsertPoint(Alloca); - - Function *MallocManagedFn = - getOrCreatePollyMallocManaged(*Alloca->getModule()); - const uint64_t Size = DL.getTypeAllocSize(Alloca->getAllocatedType()); - Value *SizeVal = Builder.getInt64(Size); - Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal}); - Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType()); - - Function *F = Alloca->getFunction(); - assert(F && "Alloca has invalid function"); - - Bitcasted->takeName(Alloca); - Alloca->replaceAllUsesWith(Bitcasted); - Alloca->eraseFromParent(); - - for (BasicBlock &BB : *F) { - ReturnInst *Return = dyn_cast(BB.getTerminator()); - if (!Return) - continue; - Builder.SetInsertPoint(Return); - - Function *FreeManagedFn = getOrCreatePollyFreeManaged(*M); - Builder.CreateCall(FreeManagedFn, {RawManagedMem}); - } -} - -// Replace all uses of `Old` with `New`, even inside `ConstantExpr`. -// -// `replaceAllUsesWith` does replace values in `ConstantExpr`. This function -// actually does replace it in `ConstantExpr`. The caveat is that if there is -// a use that is *outside* a function (say, at global declarations), we fail. -// So, this is meant to be used on values which we know will only be used -// within functions. -// -// This process works by looking through the uses of `Old`. If it finds a -// `ConstantExpr`, it recursively looks for the owning instruction. -// Then, it expands all the `ConstantExpr` to instructions and replaces -// `Old` with `New` in the expanded instructions. -static void replaceAllUsesAndConstantUses(Value *Old, Value *New, - PollyIRBuilder &Builder) { - SmallVector UserInstructions; - // Get all instructions that use array. We need to do this weird thing - // because `Constant`s that contain this array neeed to be expanded into - // instructions so that we can replace their parameters. `Constant`s cannot - // be edited easily, so we choose to convert all `Constant`s to - // `Instruction`s and handle all of the uses of `Array` uniformly. 
- for (Use &ArrayUse : Old->uses()) - getInstructionUsersOfValue(ArrayUse.getUser(), UserInstructions); - - for (Instruction *I : UserInstructions) - rewriteOldValToNew(I, Old, New, Builder); -} - -class ManagedMemoryRewritePass final : public ModulePass { -public: - static char ID; - GPUArch Architecture; - GPURuntime Runtime; - - ManagedMemoryRewritePass() : ModulePass(ID) {} - bool runOnModule(Module &M) override { - const DataLayout &DL = M.getDataLayout(); - - Function *Malloc = M.getFunction("malloc"); - - if (Malloc) { - PollyIRBuilder Builder(M.getContext()); - Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M); - assert(PollyMallocManaged && "unable to create polly_mallocManaged"); - - replaceAllUsesAndConstantUses(Malloc, PollyMallocManaged, Builder); - Malloc->eraseFromParent(); - } - - Function *Free = M.getFunction("free"); - - if (Free) { - PollyIRBuilder Builder(M.getContext()); - Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M); - assert(PollyFreeManaged && "unable to create polly_freeManaged"); - - replaceAllUsesAndConstantUses(Free, PollyFreeManaged, Builder); - Free->eraseFromParent(); - } - - SmallPtrSet GlobalsToErase; - for (GlobalVariable &Global : M.globals()) - replaceGlobalArray(M, DL, Global, GlobalsToErase); - for (GlobalVariable *G : GlobalsToErase) - G->eraseFromParent(); - - // Rewrite allocas to cudaMallocs if we are asked to do so. - if (RewriteAllocas) { - SmallSet AllocasToBeManaged; - for (Function &F : M.functions()) - getAllocasToBeManaged(F, AllocasToBeManaged); - - for (AllocaInst *Alloca : AllocasToBeManaged) - rewriteAllocaAsManagedMemory(Alloca, DL); - } - - return true; - } -}; -} // namespace -char ManagedMemoryRewritePass::ID = 42; - -Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch, - GPURuntime Runtime) { - ManagedMemoryRewritePass *pass = new ManagedMemoryRewritePass(); - pass->Runtime = Runtime; - pass->Architecture = Arch; - return pass; -} - -INITIALIZE_PASS_BEGIN( - ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory", - "Polly - Rewrite all allocations in heap & data section to managed memory", - false, false) -INITIALIZE_PASS_DEPENDENCY(PPCGCodeGeneration); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_END( - ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory", - "Polly - Rewrite all allocations in heap & data section to managed memory", - false, false) diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp deleted file mode 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ /dev/null @@ -1,3657 +0,0 @@ -//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Take a scop created by ScopInfo and map it to GPU code using the ppcg -// GPU mapping strategy. 
-// -//===----------------------------------------------------------------------===// - -#include "polly/CodeGen/PPCGCodeGeneration.h" -#include "polly/CodeGen/CodeGeneration.h" -#include "polly/CodeGen/IslAst.h" -#include "polly/CodeGen/IslNodeBuilder.h" -#include "polly/CodeGen/PerfMonitor.h" -#include "polly/CodeGen/Utils.h" -#include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" -#include "polly/Options.h" -#include "polly/ScopDetection.h" -#include "polly/ScopInfo.h" -#include "polly/Support/ISLTools.h" -#include "polly/Support/SCEVValidator.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/IntrinsicsNVPTX.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/InitializePasses.h" -#include "llvm/Linker/Linker.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "isl/union_map.h" -#include - -extern "C" { -#include "ppcg/cuda.h" -#include "ppcg/gpu.h" -#include "ppcg/ppcg.h" -} - -#include "llvm/Support/Debug.h" - -using namespace polly; -using namespace llvm; - -#define DEBUG_TYPE "polly-codegen-ppcg" - -static cl::opt DumpSchedule("polly-acc-dump-schedule", - cl::desc("Dump the computed GPU Schedule"), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt - DumpCode("polly-acc-dump-code", - cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, - cl::cat(PollyCategory)); - -static cl::opt DumpKernelIR("polly-acc-dump-kernel-ir", - cl::desc("Dump the kernel LLVM-IR"), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt DumpKernelASM("polly-acc-dump-kernel-asm", - cl::desc("Dump the kernel assembly code"), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt FastMath("polly-acc-fastmath", - cl::desc("Allow unsafe math optimizations"), - cl::Hidden, cl::cat(PollyCategory)); -static cl::opt SharedMemory("polly-acc-use-shared", - cl::desc("Use shared memory"), cl::Hidden, - cl::cat(PollyCategory)); -static cl::opt PrivateMemory("polly-acc-use-private", - cl::desc("Use private memory"), cl::Hidden, - cl::cat(PollyCategory)); - -bool polly::PollyManagedMemory; -static cl::opt - XManagedMemory("polly-acc-codegen-managed-memory", - cl::desc("Generate Host kernel code assuming" - " that all memory has been" - " declared as managed memory"), - cl::location(PollyManagedMemory), cl::Hidden, - cl::init(false), cl::cat(PollyCategory)); - -static cl::opt - FailOnVerifyModuleFailure("polly-acc-fail-on-verify-module-failure", - cl::desc("Fail and generate a backtrace if" - " verifyModule fails on the GPU " - " kernel module."), - cl::Hidden, cl::cat(PollyCategory)); - -static cl::opt CUDALibDevice( - "polly-acc-libdevice", cl::desc("Path to CUDA libdevice"), cl::Hidden, - cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.ll"), - cl::cat(PollyCategory)); - -static cl::opt - CudaVersion("polly-acc-cuda-version", - cl::desc("The CUDA version to compile for"), cl::Hidden, - cl::init("sm_30"), cl::cat(PollyCategory)); - -static cl::opt - MinCompute("polly-acc-mincompute", - cl::desc("Minimal number of compute statements to run on GPU."), - cl::Hidden, cl::init(10 * 512 * 512)); - -GPURuntime polly::GPURuntimeChoice; -static cl::opt - XGPURuntimeChoice("polly-gpu-runtime", - cl::desc("The GPU Runtime API to target"), - 
cl::values(clEnumValN(GPURuntime::CUDA, "libcudart", - "use the CUDA Runtime API"), - clEnumValN(GPURuntime::OpenCL, "libopencl", - "use the OpenCL Runtime API")), - cl::location(polly::GPURuntimeChoice), - cl::init(GPURuntime::CUDA), cl::cat(PollyCategory)); - -GPUArch polly::GPUArchChoice; -static cl::opt - XGPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"), - cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64", - "target NVIDIA 64-bit architecture"), - clEnumValN(GPUArch::SPIR32, "spir32", - "target SPIR 32-bit architecture"), - clEnumValN(GPUArch::SPIR64, "spir64", - "target SPIR 64-bit architecture")), - cl::location(polly::GPUArchChoice), - cl::init(GPUArch::NVPTX64), cl::cat(PollyCategory)); - -extern bool polly::PerfMonitoring; - -/// Return a unique name for a Scop, which is the scop region with the -/// function name. -std::string getUniqueScopName(const Scop *S) { - return "Scop Region: " + S->getNameStr() + - " | Function: " + std::string(S->getFunction().getName()); -} - -/// Used to store information PPCG wants for kills. This information is -/// used by live range reordering. -/// -/// @see computeLiveRangeReordering -/// @see GPUNodeBuilder::createPPCGScop -/// @see GPUNodeBuilder::createPPCGProg -struct MustKillsInfo { - /// Collection of all kill statements that will be sequenced at the end of - /// PPCGScop->schedule. - /// - /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set` - /// which merges schedules in *arbitrary* order. - /// (we don't care about the order of the kills anyway). - isl::schedule KillsSchedule; - /// Map from kill statement instances to scalars that need to be - /// killed. - /// - /// We currently derive kill information for: - /// 1. phi nodes. PHI nodes are not alive outside the scop and can - /// consequently all be killed. - /// 2. Scalar arrays that are not used outside the Scop. This is - /// checked by `isScalarUsesContainedInScop`. - /// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } - isl::union_map TaggedMustKills; - - /// Tagged must kills stripped of the tags. - /// [params] -> { Stmt_phantom[] -> scalar_to_kill[] } - isl::union_map MustKills; - - MustKillsInfo() : KillsSchedule() {} -}; - -/// Check if SAI's uses are entirely contained within Scop S. -/// If a scalar is used only with a Scop, we are free to kill it, as no data -/// can flow in/out of the value any more. -/// @see computeMustKillsInfo -static bool isScalarUsesContainedInScop(const Scop &S, - const ScopArrayInfo *SAI) { - assert(SAI->isValueKind() && "this function only deals with scalars." - " Dealing with arrays required alias analysis"); - - const Region &R = S.getRegion(); - for (User *U : SAI->getBasePtr()->users()) { - Instruction *I = dyn_cast(U); - assert(I && "invalid user of scop array info"); - if (!R.contains(I)) - return false; - } - return true; -} - -/// Compute must-kills needed to enable live range reordering with PPCG. -/// -/// @params S The Scop to compute live range reordering information -/// @returns live range reordering information that can be used to setup -/// PPCG. -static MustKillsInfo computeMustKillsInfo(const Scop &S) { - const isl::space ParamSpace = S.getParamSpace(); - MustKillsInfo Info; - - // 1. Collect all ScopArrayInfo that satisfy *any* of the criteria: - // 1.1 phi nodes in scop. 
- // 1.2 scalars that are only used within the scop - SmallVector KillMemIds; - for (ScopArrayInfo *SAI : S.arrays()) { - if (SAI->isPHIKind() || - (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI))) - KillMemIds.push_back(isl::manage(SAI->getBasePtrId().release())); - } - - Info.TaggedMustKills = isl::union_map::empty(ParamSpace.ctx()); - Info.MustKills = isl::union_map::empty(ParamSpace.ctx()); - - // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the - // schedule: - // - filter: "[control] -> { }" - // So, we choose to not create this to keep the output a little nicer, - // at the cost of some code complexity. - Info.KillsSchedule = {}; - - for (isl::id &ToKillId : KillMemIds) { - isl::id KillStmtId = isl::id::alloc( - S.getIslCtx(), - std::string("SKill_phantom_").append(ToKillId.get_name()), nullptr); - - // NOTE: construction of tagged_must_kill: - // 2. We need to construct a map: - // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } - // To construct this, we use `isl_map_domain_product` on 2 maps`: - // 2a. StmtToScalar: - // [param] -> { Stmt_phantom[] -> scalar_to_kill[] } - // 2b. PhantomRefToScalar: - // [param] -> { ref_phantom[] -> scalar_to_kill[] } - // - // Combining these with `isl_map_domain_product` gives us - // TaggedMustKill: - // [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } - - // 2a. [param] -> { Stmt[] -> scalar_to_kill[] } - isl::map StmtToScalar = isl::map::universe(ParamSpace); - StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::in, isl::id(KillStmtId)); - StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::out, isl::id(ToKillId)); - - isl::id PhantomRefId = isl::id::alloc( - S.getIslCtx(), std::string("ref_phantom") + ToKillId.get_name(), - nullptr); - - // 2b. [param] -> { phantom_ref[] -> scalar_to_kill[] } - isl::map PhantomRefToScalar = isl::map::universe(ParamSpace); - PhantomRefToScalar = - PhantomRefToScalar.set_tuple_id(isl::dim::in, PhantomRefId); - PhantomRefToScalar = - PhantomRefToScalar.set_tuple_id(isl::dim::out, ToKillId); - - // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } - isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar); - Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill); - - // 2. [param] -> { Stmt[] -> scalar_to_kill[] } - Info.MustKills = Info.TaggedMustKills.domain_factor_domain(); - - // 3. Create the kill schedule of the form: - // "[param] -> { Stmt_phantom[] }" - // Then add this to Info.KillsSchedule. - isl::space KillStmtSpace = ParamSpace; - KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId); - isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace); - - isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain); - if (!Info.KillsSchedule.is_null()) - Info.KillsSchedule = isl::manage( - isl_schedule_set(Info.KillsSchedule.release(), KillSchedule.copy())); - else - Info.KillsSchedule = KillSchedule; - } - - return Info; -} - -/// Create the ast expressions for a ScopStmt. -/// -/// This function is a callback for to generate the ast expressions for each -/// of the scheduled ScopStmts. 
-static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( - void *StmtT, __isl_take isl_ast_build *Build_C, - isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, - isl_id *Id, void *User), - void *UserIndex, - isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), - void *UserExpr) { - - ScopStmt *Stmt = (ScopStmt *)StmtT; - - if (!Stmt || !Build_C) - return NULL; - - isl::ast_build Build = isl::manage_copy(Build_C); - isl::ctx Ctx = Build.ctx(); - isl::id_to_ast_expr RefToExpr = isl::id_to_ast_expr::alloc(Ctx, 0); - - Stmt->setAstBuild(Build); - - for (MemoryAccess *Acc : *Stmt) { - isl::map AddrFunc = Acc->getAddressFunction(); - AddrFunc = AddrFunc.intersect_domain(Stmt->getDomain()); - - isl::id RefId = Acc->getId(); - isl::pw_multi_aff PMA = isl::pw_multi_aff::from_map(AddrFunc); - - isl::multi_pw_aff MPA = isl::multi_pw_aff(PMA); - MPA = MPA.coalesce(); - MPA = isl::manage(FunctionIndex(MPA.release(), RefId.get(), UserIndex)); - - isl::ast_expr Access = Build.access_from(MPA); - Access = isl::manage(FunctionExpr(Access.release(), RefId.get(), UserExpr)); - RefToExpr = RefToExpr.set(RefId, Access); - } - - return RefToExpr.release(); -} - -/// Given a LLVM Type, compute its size in bytes, -static int computeSizeInBytes(const Type *T) { - int bytes = T->getPrimitiveSizeInBits() / 8; - if (bytes == 0) - bytes = T->getScalarSizeInBits() / 8; - return bytes; -} - -/// Generate code for a GPU specific isl AST. -/// -/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which -/// generates code for general-purpose AST nodes, with special functionality -/// for generating GPU specific user nodes. -/// -/// @see GPUNodeBuilder::createUser -class GPUNodeBuilder final : public IslNodeBuilder { -public: - GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, - const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, - DominatorTree &DT, Scop &S, BasicBlock *StartBlock, - gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) - : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), - Prog(Prog), Runtime(Runtime), Arch(Arch) { - getExprBuilder().setIDToSAI(&IDToSAI); - } - - /// Create after-run-time-check initialization code. - void initializeAfterRTH(); - - /// Finalize the generated scop. - void finalize() override; - - /// Track if the full build process was successful. - /// - /// This value is set to false, if throughout the build process an error - /// occurred which prevents us from generating valid GPU code. - bool BuildSuccessful = true; - - /// The maximal number of loops surrounding a sequential kernel. - unsigned DeepestSequential = 0; - - /// The maximal number of loops surrounding a parallel kernel. - unsigned DeepestParallel = 0; - - /// Return the name to set for the ptx_kernel. - std::string getKernelFuncName(int Kernel_id); - -private: - /// A vector of array base pointers for which a new ScopArrayInfo was created. - /// - /// This vector is used to delete the ScopArrayInfo when it is not needed any - /// more. - std::vector LocalArrays; - - /// A map from ScopArrays to their corresponding device allocations. - std::map DeviceAllocations; - - /// The current GPU context. - Value *GPUContext; - - /// The set of isl_ids allocated in the kernel - std::vector KernelIds; - - /// A module containing GPU code. - /// - /// This pointer is only set in case we are currently generating GPU code. - std::unique_ptr GPUModule; - - /// The GPU program we generate code for. 
- gpu_prog *Prog; - - /// The GPU Runtime implementation to use (OpenCL or CUDA). - GPURuntime Runtime; - - /// The GPU Architecture to target. - GPUArch Arch; - - /// Class to free isl_ids. - class IslIdDeleter final { - public: - void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; - }; - - /// A set containing all isl_ids allocated in a GPU kernel. - /// - /// By releasing this set all isl_ids will be freed. - std::set> KernelIDs; - - IslExprBuilder::IDToScopArrayInfoTy IDToSAI; - - /// Create code for user-defined AST nodes. - /// - /// These AST nodes can be of type: - /// - /// - ScopStmt: A computational statement (TODO) - /// - Kernel: A GPU kernel call (TODO) - /// - Data-Transfer: A GPU <-> CPU data-transfer - /// - In-kernel synchronization - /// - In-kernel memory copy statement - /// - /// @param UserStmt The ast node to generate code for. - void createUser(__isl_take isl_ast_node *UserStmt) override; - - void createFor(__isl_take isl_ast_node *Node) override; - - enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST }; - - /// Create code for a data transfer statement - /// - /// @param TransferStmt The data transfer statement. - /// @param Direction The direction in which to transfer data. - void createDataTransfer(__isl_take isl_ast_node *TransferStmt, - enum DataDirection Direction); - - /// Find llvm::Values referenced in GPU kernel. - /// - /// @param Kernel The kernel to scan for llvm::Values - /// - /// @returns A tuple, whose: - /// - First element contains the set of values referenced by the - /// kernel - /// - Second element contains the set of functions referenced by the - /// kernel. All functions in the set satisfy - /// `isValidFunctionInKernel`. - /// - Third element contains loops that have induction variables - /// which are used in the kernel, *and* these loops are *neither* - /// in the scop, nor do they immediately surroung the Scop. - /// See [Code generation of induction variables of loops outside - /// Scops] - std::tuple, SetVector, SetVector, - isl::space> - getReferencesInKernel(ppcg_kernel *Kernel); - - /// Compute the sizes of the execution grid for a given kernel. - /// - /// @param Kernel The kernel to compute grid sizes for. - /// - /// @returns A tuple with grid sizes for X and Y dimension - std::tuple getGridSizes(ppcg_kernel *Kernel); - - /// Get the managed array pointer for sending host pointers to the device. - /// \note - /// This is to be used only with managed memory - Value *getManagedDeviceArray(gpu_array_info *Array, ScopArrayInfo *ArrayInfo); - - /// Compute the sizes of the thread blocks for a given kernel. - /// - /// @param Kernel The kernel to compute thread block sizes for. - /// - /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. - std::tuple getBlockSizes(ppcg_kernel *Kernel); - - /// Store a specific kernel launch parameter in the array of kernel launch - /// parameters. - /// - /// @param ArrayTy Array type of \p Parameters. - /// @param Parameters The list of parameters in which to store. - /// @param Param The kernel launch parameter to store. - /// @param Index The index in the parameter list, at which to store the - /// parameter. - void insertStoreParameter(Type *ArrayTy, Instruction *Parameters, - Instruction *Param, int Index); - - /// Create kernel launch parameters. - /// - /// @param Kernel The kernel to create parameters for. - /// @param F The kernel function that has been created. - /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 
- /// - /// @returns A stack allocated array with pointers to the parameter - /// values that are passed to the kernel. - Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, - SetVector SubtreeValues); - - /// Create declarations for kernel variable. - /// - /// This includes shared memory declarations. - /// - /// @param Kernel The kernel definition to create variables for. - /// @param FN The function into which to generate the variables. - void createKernelVariables(ppcg_kernel *Kernel, Function *FN); - - /// Add CUDA annotations to module. - /// - /// Add a set of CUDA annotations that declares the maximal block dimensions - /// that will be used to execute the CUDA kernel. This allows the NVIDIA - /// PTX compiler to bound the number of allocated registers to ensure the - /// resulting kernel is known to run with up to as many block dimensions - /// as specified here. - /// - /// @param M The module to add the annotations to. - /// @param BlockDimX The size of block dimension X. - /// @param BlockDimY The size of block dimension Y. - /// @param BlockDimZ The size of block dimension Z. - void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, - Value *BlockDimZ); - - /// Create GPU kernel. - /// - /// Code generate the kernel described by @p KernelStmt. - /// - /// @param KernelStmt The ast node to generate kernel code for. - void createKernel(__isl_take isl_ast_node *KernelStmt); - - /// Generate code that computes the size of an array. - /// - /// @param Array The array for which to compute a size. - Value *getArraySize(gpu_array_info *Array); - - /// Generate code to compute the minimal offset at which an array is accessed. - /// - /// The offset of an array is the minimal array location accessed in a scop. - /// - /// Example: - /// - /// for (long i = 0; i < 100; i++) - /// A[i + 42] += ... - /// - /// getArrayOffset(A) results in 42. - /// - /// @param Array The array for which to compute the offset. - /// @returns An llvm::Value that contains the offset of the array. - Value *getArrayOffset(gpu_array_info *Array); - - /// Prepare the kernel arguments for kernel code generation - /// - /// @param Kernel The kernel to generate code for. - /// @param FN The function created for the kernel. - void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN); - - /// Create kernel function. - /// - /// Create a kernel function located in a newly created module that can serve - /// as target for device code generation. Set the Builder to point to the - /// start block of this newly created function. - /// - /// @param Kernel The kernel to generate code for. - /// @param SubtreeValues The set of llvm::Values referenced by this kernel. - /// @param SubtreeFunctions The set of llvm::Functions referenced by this - /// kernel. - void createKernelFunction(ppcg_kernel *Kernel, - SetVector &SubtreeValues, - SetVector &SubtreeFunctions); - - /// Create the declaration of a kernel function. - /// - /// The kernel function takes as arguments: - /// - /// - One i8 pointer for each external array reference used in the kernel. - /// - Host iterators - /// - Parameters - /// - Other LLVM Value references (TODO) - /// - /// @param Kernel The kernel to generate the function declaration for. - /// @param SubtreeValues The set of llvm::Values referenced by this kernel. - /// - /// @returns The newly declared function. - Function *createKernelFunctionDecl(ppcg_kernel *Kernel, - SetVector &SubtreeValues); - - /// Insert intrinsic functions to obtain thread and block ids. 
- /// - /// @param The kernel to generate the intrinsic functions for. - void insertKernelIntrinsics(ppcg_kernel *Kernel); - - /// Insert function calls to retrieve the SPIR group/local ids. - /// - /// @param Kernel The kernel to generate the function calls for. - /// @param SizeTypeIs64Bit Whether size_t of the openCl device is 64bit. - void insertKernelCallsSPIR(ppcg_kernel *Kernel, bool SizeTypeIs64bit); - - /// Setup the creation of functions referenced by the GPU kernel. - /// - /// 1. Create new function declarations in GPUModule which are the same as - /// SubtreeFunctions. - /// - /// 2. Populate IslNodeBuilder::ValueMap with mappings from - /// old functions (that come from the original module) to new functions - /// (that are created within GPUModule). That way, we generate references - /// to the correct function (in GPUModule) in BlockGenerator. - /// - /// @see IslNodeBuilder::ValueMap - /// @see BlockGenerator::GlobalMap - /// @see BlockGenerator::getNewValue - /// @see GPUNodeBuilder::getReferencesInKernel. - /// - /// @param SubtreeFunctions The set of llvm::Functions referenced by - /// this kernel. - void setupKernelSubtreeFunctions(SetVector SubtreeFunctions); - - /// Create a global-to-shared or shared-to-global copy statement. - /// - /// @param CopyStmt The copy statement to generate code for - void createKernelCopy(ppcg_kernel_stmt *CopyStmt); - - /// Create code for a ScopStmt called in @p Expr. - /// - /// @param Expr The expression containing the call. - /// @param KernelStmt The kernel statement referenced in the call. - void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); - - /// Create an in-kernel synchronization call. - void createKernelSync(); - - /// Create a PTX assembly string for the current GPU kernel. - /// - /// @returns A string containing the corresponding PTX assembly code. - std::string createKernelASM(); - - /// Remove references from the dominator tree to the kernel function @p F. - /// - /// @param F The function to remove references to. - void clearDominators(Function *F); - - /// Remove references from scalar evolution to the kernel function @p F. - /// - /// @param F The function to remove references to. - void clearScalarEvolution(Function *F); - - /// Remove references from loop info to the kernel function @p F. - /// - /// @param F The function to remove references to. - void clearLoops(Function *F); - - /// Check if the scop requires to be linked with CUDA's libdevice. - bool requiresCUDALibDevice(); - - /// Link with the NVIDIA libdevice library (if needed and available). - void addCUDALibDevice(); - - /// Finalize the generation of the kernel function. - /// - /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- - /// dump its IR to stderr. - /// - /// @returns The Assembly string of the kernel. - std::string finalizeKernelFunction(); - - /// Finalize the generation of the kernel arguments. - /// - /// This function ensures that not-read-only scalars used in a kernel are - /// stored back to the global memory location they are backed with before - /// the kernel terminates. - /// - /// @params Kernel The kernel to finalize kernel arguments for. - void finalizeKernelArguments(ppcg_kernel *Kernel); - - /// Create code that allocates memory to store arrays on device. - void allocateDeviceArrays(); - - /// Create code to prepare the managed device pointers. - void prepareManagedDeviceArrays(); - - /// Free all allocated device arrays. 
-
-  /// Free all allocated device arrays.
-  void freeDeviceArrays();
-
-  /// Create a call to initialize the GPU context.
-  ///
-  /// @returns A pointer to the newly initialized context.
-  Value *createCallInitContext();
-
-  /// Create a call to get the device pointer for a kernel allocation.
-  ///
-  /// @param Allocation The Polly GPU allocation.
-  ///
-  /// @returns The device parameter corresponding to this allocation.
-  Value *createCallGetDevicePtr(Value *Allocation);
-
-  /// Create a call to free the GPU context.
-  ///
-  /// @param Context A pointer to an initialized GPU context.
-  void createCallFreeContext(Value *Context);
-
-  /// Create a call to allocate memory on the device.
-  ///
-  /// @param Size The size of memory to allocate.
-  ///
-  /// @returns A pointer that identifies this allocation.
-  Value *createCallAllocateMemoryForDevice(Value *Size);
-
-  /// Create a call to free a device array.
-  ///
-  /// @param Array The device array to free.
-  void createCallFreeDeviceMemory(Value *Array);
-
-  /// Create a call to copy data from host to device.
-  ///
-  /// @param HostPtr   A pointer to the host data that should be copied.
-  /// @param DevicePtr A device pointer specifying the location to copy to.
-  /// @param Size      The number of bytes to copy.
-  void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
-                                      Value *Size);
-
-  /// Create a call to copy data from device to host.
-  ///
-  /// @param DevicePtr A pointer to the device data that should be copied.
-  /// @param HostPtr   A host pointer specifying the location to copy to.
-  /// @param Size      The number of bytes to copy.
-  void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
-                                      Value *Size);
-
-  /// Create a call to synchronize Host & Device.
-  /// \note
-  /// This is to be used only with managed memory.
-  void createCallSynchronizeDevice();
-
-  /// Create a call to get a kernel from an assembly string.
-  ///
-  /// @param Buffer The string describing the kernel.
-  /// @param Entry  The name of the kernel function to call.
-  ///
-  /// @returns A pointer to a kernel object.
-  Value *createCallGetKernel(Value *Buffer, Value *Entry);
-
-  /// Create a call to free a GPU kernel.
-  ///
-  /// @param GPUKernel The kernel to free.
-  void createCallFreeKernel(Value *GPUKernel);
-
-  /// Create a call to launch a GPU kernel.
-  ///
-  /// @param GPUKernel  The kernel to launch.
-  /// @param GridDimX   The size of the first grid dimension.
-  /// @param GridDimY   The size of the second grid dimension.
-  /// @param BlockDimX  The size of the first block dimension.
-  /// @param BlockDimY  The size of the second block dimension.
-  /// @param BlockDimZ  The size of the third block dimension.
-  /// @param Parameters A pointer to an array that itself contains pointers to
-  ///                   the parameter values passed for each kernel argument.
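All of the createCall* helpers above follow one pattern in their implementations below: look the polly_* runtime stub up in the module, declare it on first use, then emit the call. With the existing Module::getOrInsertFunction API, the same pattern condenses to a sketch like this (hypothetical helper, shown for one stub):

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Module.h"

  // Declare (once) and return the stub `i8* polly_getDevicePtr(i8*)`.
  llvm::FunctionCallee getDevicePtrStub(llvm::Module &M, llvm::IRBuilder<> &B) {
    return M.getOrInsertFunction("polly_getDevicePtr", B.getInt8PtrTy(),
                                 B.getInt8PtrTy());
  }

Spelling the check-then-declare out explicitly, as the code below does, keeps each stub's declared signature visible next to its call site.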
- void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, - Value *GridDimY, Value *BlockDimX, - Value *BlockDimY, Value *BlockDimZ, - Value *Parameters); -}; - -std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) { - return "FUNC_" + S.getFunction().getName().str() + "_SCOP_" + - std::to_string(S.getID()) + "_KERNEL_" + std::to_string(Kernel_id); -} - -void GPUNodeBuilder::initializeAfterRTH() { - BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(), - &*Builder.GetInsertPoint(), &DT, &LI); - NewBB->setName("polly.acc.initialize"); - Builder.SetInsertPoint(&NewBB->front()); - - GPUContext = createCallInitContext(); - - if (!PollyManagedMemory) - allocateDeviceArrays(); - else - prepareManagedDeviceArrays(); -} - -void GPUNodeBuilder::finalize() { - if (!PollyManagedMemory) - freeDeviceArrays(); - - createCallFreeContext(GPUContext); - IslNodeBuilder::finalize(); -} - -void GPUNodeBuilder::allocateDeviceArrays() { - assert(!PollyManagedMemory && - "Managed memory will directly send host pointers " - "to the kernel. There is no need for device arrays"); - isl_ast_build *Build = isl_ast_build_from_context(S.getContext().release()); - - for (int i = 0; i < Prog->n_array; ++i) { - gpu_array_info *Array = &Prog->array[i]; - auto *ScopArray = (ScopArrayInfo *)Array->user; - std::string DevArrayName("p_dev_array_"); - DevArrayName.append(Array->name); - - Value *ArraySize = getArraySize(Array); - Value *Offset = getArrayOffset(Array); - if (Offset) - ArraySize = Builder.CreateSub( - ArraySize, - Builder.CreateMul(Offset, - Builder.getInt64(ScopArray->getElemSizeInBytes()))); - const SCEV *SizeSCEV = SE.getSCEV(ArraySize); - // It makes no sense to have an array of size 0. The CUDA API will - // throw an error anyway if we invoke `cuMallocManaged` with size `0`. We - // choose to be defensive and catch this at the compile phase. It is - // most likely that we are doing something wrong with size computation. - if (SizeSCEV->isZero()) { - errs() << getUniqueScopName(&S) - << " has computed array size 0: " << *ArraySize - << " | for array: " << *(ScopArray->getBasePtr()) - << ". 
This is illegal, exiting.\n";
-      report_fatal_error("array size was computed to be 0");
-    }
-
-    Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
-    DevArray->setName(DevArrayName);
-    DeviceAllocations[ScopArray] = DevArray;
-  }
-
-  isl_ast_build_free(Build);
-}
-
-void GPUNodeBuilder::prepareManagedDeviceArrays() {
-  assert(PollyManagedMemory &&
-         "Device arrays must only be prepared in managed-memory mode");
-  for (int i = 0; i < Prog->n_array; ++i) {
-    gpu_array_info *Array = &Prog->array[i];
-    ScopArrayInfo *ScopArray = (ScopArrayInfo *)Array->user;
-    Value *HostPtr;
-
-    if (gpu_array_is_scalar(Array))
-      HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
-    else
-      HostPtr = ScopArray->getBasePtr();
-    HostPtr = getLatestValue(HostPtr);
-
-    Value *Offset = getArrayOffset(Array);
-    if (Offset) {
-      HostPtr = Builder.CreatePointerCast(
-          HostPtr, ScopArray->getElementType()->getPointerTo());
-      HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset);
-    }
-
-    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
-    DeviceAllocations[ScopArray] = HostPtr;
-  }
-}
-
-void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
-                                        Value *BlockDimY, Value *BlockDimZ) {
-  auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
-
-  for (auto &F : *M) {
-    if (F.getCallingConv() != CallingConv::PTX_Kernel)
-      continue;
-
-    Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};
-
-    Metadata *Elements[] = {
-        ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"),
-        ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
-        ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
-        ValueAsMetadata::get(V[2]),
-    };
-    MDNode *Node = MDNode::get(M->getContext(), Elements);
-    AnnotationNode->addOperand(Node);
-  }
-}
-
-void GPUNodeBuilder::freeDeviceArrays() {
-  assert(!PollyManagedMemory && "Managed memory does not use device arrays");
-  for (auto &Array : DeviceAllocations)
-    createCallFreeDeviceMemory(Array.second);
-}
-
-Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
-  const char *Name = "polly_getKernel";
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  Function *F = M->getFunction(Name);
-
-  // If F is not available, declare it.
-  if (!F) {
-    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-    std::vector<Type *> Args;
-    Args.push_back(Builder.getInt8PtrTy());
-    Args.push_back(Builder.getInt8PtrTy());
-    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
-    F = Function::Create(Ty, Linkage, Name, M);
-  }
-
-  return Builder.CreateCall(F, {Buffer, Entry});
-}
-
-Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
-  const char *Name = "polly_getDevicePtr";
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  Function *F = M->getFunction(Name);
-
-  // If F is not available, declare it.
- if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - return Builder.CreateCall(F, {Allocation}); -} - -void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, - Value *GridDimY, Value *BlockDimX, - Value *BlockDimY, Value *BlockDimZ, - Value *Parameters) { - const char *Name = "polly_launchKernel"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt32Ty()); - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, - BlockDimZ, Parameters}); -} - -void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { - const char *Name = "polly_freeKernel"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {GPUKernel}); -} - -void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { - assert(!PollyManagedMemory && - "Managed memory does not allocate or free memory " - "for device"); - const char *Name = "polly_freeDeviceMemory"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {Array}); -} - -Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { - assert(!PollyManagedMemory && - "Managed memory does not allocate or free memory " - "for device"); - const char *Name = "polly_allocateMemoryForDevice"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. 
- if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt64Ty()); - FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - return Builder.CreateCall(F, {Size}); -} - -void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, - Value *DeviceData, - Value *Size) { - assert(!PollyManagedMemory && - "Managed memory does not transfer memory between " - "device and host"); - const char *Name = "polly_copyFromHostToDevice"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt64Ty()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {HostData, DeviceData, Size}); -} - -void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, - Value *HostData, - Value *Size) { - assert(!PollyManagedMemory && - "Managed memory does not transfer memory between " - "device and host"); - const char *Name = "polly_copyFromDeviceToHost"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt8PtrTy()); - Args.push_back(Builder.getInt64Ty()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {DeviceData, HostData, Size}); -} - -void GPUNodeBuilder::createCallSynchronizeDevice() { - assert(PollyManagedMemory && "explicit synchronization is only necessary for " - "managed memory"); - const char *Name = "polly_synchronizeDevice"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F); -} - -Value *GPUNodeBuilder::createCallInitContext() { - const char *Name; - - switch (Runtime) { - case GPURuntime::CUDA: - Name = "polly_initContextCUDA"; - break; - case GPURuntime::OpenCL: - Name = "polly_initContextCL"; - break; - } - - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - return Builder.CreateCall(F, {}); -} - -void GPUNodeBuilder::createCallFreeContext(Value *Context) { - const char *Name = "polly_freeContext"; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. 
- if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - Args.push_back(Builder.getInt8PtrTy()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall(F, {Context}); -} - -/// Check if one string is a prefix of another. -/// -/// @param String The string in which to look for the prefix. -/// @param Prefix The prefix to look for. -static bool isPrefix(std::string String, std::string Prefix) { - return String.find(Prefix) == 0; -} - -Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { - isl::ast_build Build = isl::ast_build::from_context(S.getContext()); - Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); - - if (!gpu_array_is_scalar(Array)) { - isl::multi_pw_aff ArrayBound = isl::manage_copy(Array->bound); - - isl::pw_aff OffsetDimZero = ArrayBound.at(0); - isl::ast_expr Res = Build.expr_from(OffsetDimZero); - - for (unsigned int i = 1; i < Array->n_index; i++) { - isl::pw_aff Bound_I = ArrayBound.at(i); - isl::ast_expr Expr = Build.expr_from(Bound_I); - Res = Res.mul(Expr); - } - - Value *NumElements = ExprBuilder.create(Res.release()); - if (NumElements->getType() != ArraySize->getType()) - NumElements = Builder.CreateSExt(NumElements, ArraySize->getType()); - ArraySize = Builder.CreateMul(ArraySize, NumElements); - } - return ArraySize; -} - -Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { - if (gpu_array_is_scalar(Array)) - return nullptr; - - isl::ast_build Build = isl::ast_build::from_context(S.getContext()); - - isl::set Min = isl::manage_copy(Array->extent).lexmin(); - - isl::set ZeroSet = isl::set::universe(Min.get_space()); - - for (unsigned i : rangeIslSize(0, Min.tuple_dim())) - ZeroSet = ZeroSet.fix_si(isl::dim::set, i, 0); - - if (Min.is_subset(ZeroSet)) { - return nullptr; - } - - isl::ast_expr Result = isl::ast_expr::from_val(isl::val(Min.ctx(), 0)); - - for (unsigned i : rangeIslSize(0, Min.tuple_dim())) { - if (i > 0) { - isl::pw_aff Bound_I = - isl::manage(isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1)); - isl::ast_expr BExpr = Build.expr_from(Bound_I); - Result = Result.mul(BExpr); - } - isl::pw_aff DimMin = Min.dim_min(i); - isl::ast_expr MExpr = Build.expr_from(DimMin); - Result = Result.add(MExpr); - } - - return ExprBuilder.create(Result.release()); -} - -Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info *Array, - ScopArrayInfo *ArrayInfo) { - assert(PollyManagedMemory && "Only used when you wish to get a host " - "pointer for sending data to the kernel, " - "with managed memory"); - std::map::iterator it; - it = DeviceAllocations.find(ArrayInfo); - assert(it != DeviceAllocations.end() && - "Device array expected to be available"); - return it->second; -} - -void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, - enum DataDirection Direction) { - assert(!PollyManagedMemory && "Managed memory needs no data transfers"); - isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); - isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); - isl_id *Id = isl_ast_expr_get_id(Arg); - auto Array = (gpu_array_info *)isl_id_get_user(Id); - auto ScopArray = (ScopArrayInfo *)(Array->user); - - Value *Size = getArraySize(Array); - Value *Offset = getArrayOffset(Array); - Value *DevPtr = DeviceAllocations[ScopArray]; - - Value *HostPtr; - - if (gpu_array_is_scalar(Array)) - HostPtr = BlockGen.getOrCreateAlloca(ScopArray); - else - HostPtr = ScopArray->getBasePtr(); 
- HostPtr = getLatestValue(HostPtr); - - if (Offset) { - HostPtr = Builder.CreatePointerCast( - HostPtr, ScopArray->getElementType()->getPointerTo()); - HostPtr = Builder.CreateGEP(ScopArray->getElementType(), HostPtr, Offset); - } - - HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); - - if (Offset) { - Size = Builder.CreateSub( - Size, Builder.CreateMul( - Offset, Builder.getInt64(ScopArray->getElemSizeInBytes()))); - } - - if (Direction == HOST_TO_DEVICE) - createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); - else - createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); - - isl_id_free(Id); - isl_ast_expr_free(Arg); - isl_ast_expr_free(Expr); - isl_ast_node_free(TransferStmt); -} - -void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { - isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); - isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); - isl_id *Id = isl_ast_expr_get_id(StmtExpr); - isl_id_free(Id); - isl_ast_expr_free(StmtExpr); - - const char *Str = isl_id_get_name(Id); - if (!strcmp(Str, "kernel")) { - createKernel(UserStmt); - if (PollyManagedMemory) - createCallSynchronizeDevice(); - isl_ast_expr_free(Expr); - return; - } - if (!strcmp(Str, "init_device")) { - initializeAfterRTH(); - isl_ast_node_free(UserStmt); - isl_ast_expr_free(Expr); - return; - } - if (!strcmp(Str, "clear_device")) { - finalize(); - isl_ast_node_free(UserStmt); - isl_ast_expr_free(Expr); - return; - } - if (isPrefix(Str, "to_device")) { - if (!PollyManagedMemory) - createDataTransfer(UserStmt, HOST_TO_DEVICE); - else - isl_ast_node_free(UserStmt); - - isl_ast_expr_free(Expr); - return; - } - - if (isPrefix(Str, "from_device")) { - if (!PollyManagedMemory) { - createDataTransfer(UserStmt, DEVICE_TO_HOST); - } else { - isl_ast_node_free(UserStmt); - } - isl_ast_expr_free(Expr); - return; - } - - isl_id *Anno = isl_ast_node_get_annotation(UserStmt); - struct ppcg_kernel_stmt *KernelStmt = - (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); - isl_id_free(Anno); - - switch (KernelStmt->type) { - case ppcg_kernel_domain: - createScopStmt(Expr, KernelStmt); - isl_ast_node_free(UserStmt); - return; - case ppcg_kernel_copy: - createKernelCopy(KernelStmt); - isl_ast_expr_free(Expr); - isl_ast_node_free(UserStmt); - return; - case ppcg_kernel_sync: - createKernelSync(); - isl_ast_expr_free(Expr); - isl_ast_node_free(UserStmt); - return; - } - - isl_ast_expr_free(Expr); - isl_ast_node_free(UserStmt); -} - -void GPUNodeBuilder::createFor(__isl_take isl_ast_node *Node) { - createForSequential(isl::manage(Node).as(), false); -} - -void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) { - isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index); - auto LocalAddr = ExprBuilder.createAccessAddress(LocalIndex); - isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index); - auto GlobalAddr = ExprBuilder.createAccessAddress(Index); - - if (KernelStmt->u.c.read) { - LoadInst *Load = - Builder.CreateLoad(GlobalAddr.second, GlobalAddr.first, "shared.read"); - Builder.CreateStore(Load, LocalAddr.first); - } else { - LoadInst *Load = - Builder.CreateLoad(LocalAddr.second, LocalAddr.first, "shared.write"); - Builder.CreateStore(Load, GlobalAddr.first); - } -} - -void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, - ppcg_kernel_stmt *KernelStmt) { - auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; - isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; - - LoopToScevMapT LTS; - LTS.insert(OutsideLoopIterations.begin(), 
OutsideLoopIterations.end());
-
-  createSubstitutions(Expr, Stmt, LTS);
-
-  if (Stmt->isBlockStmt())
-    BlockGen.copyStmt(*Stmt, LTS, Indexes);
-  else
-    RegionGen.copyStmt(*Stmt, LTS, Indexes);
-}
-
-void GPUNodeBuilder::createKernelSync() {
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  const char *SpirName = "__gen_ocl_barrier_global";
-
-  Function *Sync;
-
-  switch (Arch) {
-  case GPUArch::SPIR64:
-  case GPUArch::SPIR32:
-    Sync = M->getFunction(SpirName);
-
-    // If Sync is not available, declare it.
-    if (!Sync) {
-      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-      std::vector<Type *> Args;
-      FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
-      Sync = Function::Create(Ty, Linkage, SpirName, M);
-      Sync->setCallingConv(CallingConv::SPIR_FUNC);
-    }
-    break;
-  case GPUArch::NVPTX64:
-    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
-    break;
-  }
-
-  Builder.CreateCall(Sync, {});
-}
-
-/// Collect llvm::Values referenced from @p Node.
-///
-/// This function only applies to isl_ast_nodes that are user_nodes referring
-/// to a ScopStmt. All other node types are ignored.
-///
-/// @param Node The node to collect references for.
-/// @param User A user pointer used as storage for the data that is collected.
-///
-/// @returns isl_bool_true if data could be collected successfully.
-isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
-  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
-    return isl_bool_true;
-
-  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
-  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
-  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
-  const char *Str = isl_id_get_name(Id);
-  isl_id_free(Id);
-  isl_ast_expr_free(StmtExpr);
-  isl_ast_expr_free(Expr);
-
-  if (!isPrefix(Str, "Stmt"))
-    return isl_bool_true;
-
-  Id = isl_ast_node_get_annotation(Node);
-  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
-  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
-  isl_id_free(Id);
-
-  addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);
-
-  return isl_bool_true;
-}
-
-/// A list of functions that are available in NVIDIA's libdevice.
-const std::set<std::string> CUDALibDeviceFunctions = {
-    "exp",      "expf",      "expl",      "cos",  "cosf", "sqrt", "sqrtf",
-    "copysign", "copysignf", "copysignl", "log",  "logf", "powi", "powif"};
-
-// A map from intrinsics to their corresponding libdevice functions.
-const std::map<std::string, std::string> IntrinsicToLibdeviceFunc = {
-    {"llvm.exp.f64", "exp"},
-    {"llvm.exp.f32", "expf"},
-    {"llvm.powi.f64.i32", "powi"},
-    {"llvm.powi.f32.i32", "powif"}};
-
-/// Return the corresponding CUDA libdevice function name for @p Name.
-/// Note that this function will try to convert intrinsics in the list
-/// IntrinsicToLibdeviceFunc into libdevice functions.
-/// This is because some intrinsics such as `exp`
-/// are not supported by the NVPTX backend.
-/// If this restriction of the backend is lifted, we should refactor our code
-/// so that we use intrinsics whenever possible.
-///
-/// Return "" if we are not compiling for CUDA.
-std::string getCUDALibDeviceFuntion(StringRef NameRef) {
-  std::string Name = NameRef.str();
-  auto It = IntrinsicToLibdeviceFunc.find(Name);
-  if (It != IntrinsicToLibdeviceFunc.end())
-    return getCUDALibDeviceFuntion(It->second);
-
-  if (CUDALibDeviceFunctions.count(Name))
-    return ("__nv_" + Name);
-
-  return "";
-}
-
-/// Check if F is a function that we can code-generate in a GPU kernel.
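A standalone restatement of the lookup above (plain C++, tables trimmed), which makes the two-hop resolution of intrinsic names visible:

  #include <map>
  #include <set>
  #include <string>

  std::string libdeviceName(const std::string &Name) {
    static const std::set<std::string> Direct = {"exp", "expf", "sqrt", "sqrtf"};
    static const std::map<std::string, std::string> FromIntrinsic = {
        {"llvm.exp.f64", "exp"}, {"llvm.exp.f32", "expf"}};
    auto It = FromIntrinsic.find(Name);
    if (It != FromIntrinsic.end())
      return libdeviceName(It->second); // "llvm.exp.f64" -> "exp" -> "__nv_exp"
    return Direct.count(Name) ? "__nv_" + Name : "";
  }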
-static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) {
-  assert(F && "F is an invalid pointer");
-  // We string compare against the name of the function to allow
-  // all variants of the intrinsic "llvm.sqrt.*", "llvm.fabs", and
-  // "llvm.copysign".
-  const StringRef Name = F->getName();
-
-  if (AllowLibDevice && getCUDALibDeviceFuntion(Name).length() > 0)
-    return true;
-
-  return F->isIntrinsic() &&
-         (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
-          Name.startswith("llvm.copysign"));
-}
-
-/// Do not take `Function` as a subtree value.
-///
-/// We try to take the reference of all subtree values and pass them along
-/// to the kernel from the host. Taking an address of any function and
-/// trying to pass it along is nonsensical. Only allow `Value`s that are not
-/// `Function`s.
-static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }
-
-/// Return `Function`s from `RawSubtreeValues`.
-static SetVector<Function *>
-getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues,
-                                 bool AllowCUDALibDevice) {
-  SetVector<Function *> SubtreeFunctions;
-  for (Value *It : RawSubtreeValues) {
-    Function *F = dyn_cast<Function>(It);
-    if (F) {
-      assert(isValidFunctionInKernel(F, AllowCUDALibDevice) &&
-             "Code should have bailed out by "
-             "this point if an invalid function "
-             "were present in a kernel.");
-      SubtreeFunctions.insert(F);
-    }
-  }
-  return SubtreeFunctions;
-}
-
-std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>,
-           isl::space>
-GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
-  SetVector<Value *> SubtreeValues;
-  SetVector<const SCEV *> SCEVs;
-  SetVector<const Loop *> Loops;
-  isl::space ParamSpace = isl::space(S.getIslCtx(), 0, 0).params();
-  SubtreeReferences References = {
-      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator(),
-      &ParamSpace};
-
-  for (const auto &I : IDToValue)
-    SubtreeValues.insert(I.second);
-
-  // NOTE: this is populated in IslNodeBuilder::addParameters
-  // See [Code generation of induction variables of loops outside Scops].
-  for (const auto &I : OutsideLoopIterations)
-    SubtreeValues.insert(cast<SCEVUnknown>(I.second)->getValue());
-
-  isl_ast_node_foreach_descendant_top_down(
-      Kernel->tree, collectReferencesInGPUStmt, &References);
-
-  for (const SCEV *Expr : SCEVs) {
-    findValues(Expr, SE, SubtreeValues);
-    findLoops(Expr, Loops);
-  }
-
-  Loops.remove_if([this](const Loop *L) {
-    return S.contains(L) || L->contains(S.getEntry());
-  });
-
-  for (auto &SAI : S.arrays())
-    SubtreeValues.remove(SAI->getBasePtr());
-
-  isl_space *Space = S.getParamSpace().release();
-  for (long i = 0, n = isl_space_dim(Space, isl_dim_param); i < n; i++) {
-    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
-    assert(IDToValue.count(Id));
-    Value *Val = IDToValue[Id];
-    SubtreeValues.remove(Val);
-    isl_id_free(Id);
-  }
-  isl_space_free(Space);
-
-  for (long i = 0, n = isl_space_dim(Kernel->space, isl_dim_set); i < n; i++) {
-    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
-    assert(IDToValue.count(Id));
-    Value *Val = IDToValue[Id];
-    SubtreeValues.remove(Val);
-    isl_id_free(Id);
-  }
-
-  // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
-  // SubtreeValues. This is important, because we should not lose any
-  // SubtreeValues in the process of constructing the
-  // ValidSubtree{Values, Functions} sets. Nor should the set
-  // ValidSubtree{Values, Functions} have any common element.
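The partition invariant spelled out in the comment above, that every collected value must land in exactly one of the two result sets, is an ordinary predicate split. A minimal generic sketch (std-only, hypothetical helper):

  #include <utility>
  #include <vector>

  template <typename T, typename Pred>
  std::pair<std::vector<T>, std::vector<T>>
  partitionBy(const std::vector<T> &In, Pred IsFirst) {
    std::pair<std::vector<T>, std::vector<T>> Out;
    for (const T &V : In)
      (IsFirst(V) ? Out.first : Out.second).push_back(V);
    return Out;
  }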
- auto ValidSubtreeValuesIt = - make_filter_range(SubtreeValues, isValidSubtreeValue); - SetVector ValidSubtreeValues(ValidSubtreeValuesIt.begin(), - ValidSubtreeValuesIt.end()); - - bool AllowCUDALibDevice = Arch == GPUArch::NVPTX64; - - SetVector ValidSubtreeFunctions( - getFunctionsFromRawSubtreeValues(SubtreeValues, AllowCUDALibDevice)); - - // @see IslNodeBuilder::getReferencesInSubtree - SetVector ReplacedValues; - for (Value *V : ValidSubtreeValues) { - auto It = ValueMap.find(V); - if (It == ValueMap.end()) - ReplacedValues.insert(V); - else - ReplacedValues.insert(It->second); - } - return std::make_tuple(ReplacedValues, ValidSubtreeFunctions, Loops, - ParamSpace); -} - -void GPUNodeBuilder::clearDominators(Function *F) { - DomTreeNode *N = DT.getNode(&F->getEntryBlock()); - std::vector Nodes; - for (po_iterator I = po_begin(N), E = po_end(N); I != E; ++I) - Nodes.push_back(I->getBlock()); - - for (BasicBlock *BB : Nodes) - DT.eraseNode(BB); -} - -void GPUNodeBuilder::clearScalarEvolution(Function *F) { - for (BasicBlock &BB : *F) { - Loop *L = LI.getLoopFor(&BB); - if (L) - SE.forgetLoop(L); - } -} - -void GPUNodeBuilder::clearLoops(Function *F) { - SmallSet WorkList; - for (BasicBlock &BB : *F) { - Loop *L = LI.getLoopFor(&BB); - if (L) - WorkList.insert(L); - } - for (auto *L : WorkList) - LI.erase(L); -} - -std::tuple GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { - std::vector Sizes; - isl::ast_build Context = isl::ast_build::from_context(S.getContext()); - - isl::multi_pw_aff GridSizePwAffs = isl::manage_copy(Kernel->grid_size); - for (long i = 0; i < Kernel->n_grid; i++) { - isl::pw_aff Size = GridSizePwAffs.at(i); - isl::ast_expr GridSize = Context.expr_from(Size); - Value *Res = ExprBuilder.create(GridSize.release()); - Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); - Sizes.push_back(Res); - } - - for (long i = Kernel->n_grid; i < 3; i++) - Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); - - return std::make_tuple(Sizes[0], Sizes[1]); -} - -std::tuple -GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { - std::vector Sizes; - - for (long i = 0; i < Kernel->n_block; i++) { - Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); - Sizes.push_back(Res); - } - - for (long i = Kernel->n_block; i < 3; i++) - Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); - - return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); -} - -void GPUNodeBuilder::insertStoreParameter(Type *ArrayTy, - Instruction *Parameters, - Instruction *Param, int Index) { - Value *Slot = Builder.CreateGEP( - ArrayTy, Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); - Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); - Builder.CreateStore(ParamTyped, Slot); -} - -Value * -GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, - SetVector SubtreeValues) { - const int NumArgs = F->arg_size(); - std::vector ArgSizes(NumArgs); - - // If we are using the OpenCL Runtime, we need to add the kernel argument - // sizes to the end of the launch-parameter list, so OpenCL can determine - // how big the respective kernel arguments are. - // Here we need to reserve adequate space for that. 
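To make the reserved layout concrete: for a kernel with N arguments, CUDA needs one i8* slot per argument, while OpenCL appends a second run of slots holding pointers to the per-argument sizes. A sketch of the slot count used in the code that follows (hypothetical helper):

  // CUDA:   [&arg0, ..., &arg(N-1)]                          -> N slots
  // OpenCL: [&arg0, ..., &arg(N-1), &size0, ..., &size(N-1)] -> 2 * N slots
  unsigned launchParamSlots(unsigned NumArgs, bool IsOpenCL) {
    return IsOpenCL ? 2 * NumArgs : NumArgs;
  }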
- Type *ArrayTy; - if (Runtime == GPURuntime::OpenCL) - ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); - else - ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumArgs); - - BasicBlock *EntryBlock = - &Builder.GetInsertBlock()->getParent()->getEntryBlock(); - auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace(); - std::string Launch = "polly_launch_" + std::to_string(Kernel->id); - Instruction *Parameters = new AllocaInst( - ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator()); - - int Index = 0; - for (long i = 0; i < Prog->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); - - if (Runtime == GPURuntime::OpenCL) - ArgSizes[Index] = SAI->getElemSizeInBytes(); - - Value *DevArray = nullptr; - if (PollyManagedMemory) { - DevArray = getManagedDeviceArray(&Prog->array[i], - const_cast(SAI)); - } else { - DevArray = DeviceAllocations[const_cast(SAI)]; - DevArray = createCallGetDevicePtr(DevArray); - } - assert(DevArray != nullptr && "Array to be offloaded to device not " - "initialized"); - Value *Offset = getArrayOffset(&Prog->array[i]); - - if (Offset) { - DevArray = Builder.CreatePointerCast( - DevArray, SAI->getElementType()->getPointerTo()); - DevArray = Builder.CreateGEP(SAI->getElementType(), DevArray, - Builder.CreateNeg(Offset)); - DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy()); - } - Value *Slot = Builder.CreateGEP( - ArrayTy, Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); - - if (gpu_array_is_read_only_scalar(&Prog->array[i])) { - Value *ValPtr = nullptr; - if (PollyManagedMemory) - ValPtr = DevArray; - else - ValPtr = BlockGen.getOrCreateAlloca(SAI); - - assert(ValPtr != nullptr && "ValPtr that should point to a valid object" - " to be stored into Parameters"); - Value *ValPtrCast = - Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy()); - Builder.CreateStore(ValPtrCast, Slot); - } else { - Instruction *Param = - new AllocaInst(Builder.getInt8PtrTy(), AddressSpace, - Launch + "_param_" + std::to_string(Index), - EntryBlock->getTerminator()); - Builder.CreateStore(DevArray, Param); - Value *ParamTyped = - Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); - Builder.CreateStore(ParamTyped, Slot); - } - Index++; - } - - int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); - - for (long i = 0; i < NumHostIters; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); - Value *Val = IDToValue[Id]; - isl_id_free(Id); - - if (Runtime == GPURuntime::OpenCL) - ArgSizes[Index] = computeSizeInBytes(Val->getType()); - - Instruction *Param = - new AllocaInst(Val->getType(), AddressSpace, - Launch + "_param_" + std::to_string(Index), - EntryBlock->getTerminator()); - Builder.CreateStore(Val, Param); - insertStoreParameter(ArrayTy, Parameters, Param, Index); - Index++; - } - - int NumVars = isl_space_dim(Kernel->space, isl_dim_param); - - for (long i = 0; i < NumVars; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); - Value *Val = IDToValue[Id]; - if (ValueMap.count(Val)) - Val = ValueMap[Val]; - isl_id_free(Id); - - if (Runtime == GPURuntime::OpenCL) - ArgSizes[Index] = computeSizeInBytes(Val->getType()); - - Instruction *Param = - new AllocaInst(Val->getType(), AddressSpace, - Launch + "_param_" + std::to_string(Index), - EntryBlock->getTerminator()); - 
Builder.CreateStore(Val, Param); - insertStoreParameter(ArrayTy, Parameters, Param, Index); - Index++; - } - - for (auto Val : SubtreeValues) { - if (Runtime == GPURuntime::OpenCL) - ArgSizes[Index] = computeSizeInBytes(Val->getType()); - - Instruction *Param = - new AllocaInst(Val->getType(), AddressSpace, - Launch + "_param_" + std::to_string(Index), - EntryBlock->getTerminator()); - Builder.CreateStore(Val, Param); - insertStoreParameter(ArrayTy, Parameters, Param, Index); - Index++; - } - - if (Runtime == GPURuntime::OpenCL) { - for (int i = 0; i < NumArgs; i++) { - Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); - Instruction *Param = - new AllocaInst(Builder.getInt32Ty(), AddressSpace, - Launch + "_param_size_" + std::to_string(i), - EntryBlock->getTerminator()); - Builder.CreateStore(Val, Param); - insertStoreParameter(ArrayTy, Parameters, Param, Index); - Index++; - } - } - - auto Location = EntryBlock->getTerminator(); - return new BitCastInst(Parameters, Builder.getInt8PtrTy(), - Launch + "_params_i8ptr", Location); -} - -void GPUNodeBuilder::setupKernelSubtreeFunctions( - SetVector SubtreeFunctions) { - for (auto Fn : SubtreeFunctions) { - const std::string ClonedFnName = Fn->getName().str(); - Function *Clone = GPUModule->getFunction(ClonedFnName); - if (!Clone) - Clone = - Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage, - ClonedFnName, GPUModule.get()); - assert(Clone && "Expected cloned function to be initialized."); - assert(ValueMap.find(Fn) == ValueMap.end() && - "Fn already present in ValueMap"); - ValueMap[Fn] = Clone; - } -} -void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { - isl_id *Id = isl_ast_node_get_annotation(KernelStmt); - ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); - isl_id_free(Id); - isl_ast_node_free(KernelStmt); - - if (Kernel->n_grid > 1) - DeepestParallel = std::max( - DeepestParallel, (unsigned)isl_space_dim(Kernel->space, isl_dim_set)); - else - DeepestSequential = std::max( - DeepestSequential, (unsigned)isl_space_dim(Kernel->space, isl_dim_set)); - - Value *BlockDimX, *BlockDimY, *BlockDimZ; - std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); - - SetVector SubtreeValues; - SetVector SubtreeFunctions; - SetVector Loops; - isl::space ParamSpace; - std::tie(SubtreeValues, SubtreeFunctions, Loops, ParamSpace) = - getReferencesInKernel(Kernel); - - // Add parameters that appear only in the access function to the kernel - // space. This is important to make sure that all isl_ids are passed as - // parameters to the kernel, even though we may not have all parameters - // in the context to improve compile time. - Kernel->space = isl_space_align_params(Kernel->space, ParamSpace.release()); - - assert(Kernel->tree && "Device AST of kernel node is empty"); - - Instruction &HostInsertPoint = *Builder.GetInsertPoint(); - IslExprBuilder::IDToValueTy HostIDs = IDToValue; - ValueMapT HostValueMap = ValueMap; - BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; - ScalarMap.clear(); - BlockGenerator::EscapeUsersAllocaMapTy HostEscapeMap = EscapeMap; - EscapeMap.clear(); - - // Create for all loops we depend on values that contain the current loop - // iteration. These values are necessary to generate code for SCEVs that - // depend on such loops. As a result we need to pass them to the subfunction. 
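The "value that contains the current loop iteration" built in the loop that follows is SCEV's canonical counter, the add recurrence {0,+,1}<L>: it starts at 0 and advances by 1 per iteration of L. Expanding it on the host is equivalent to carrying an explicit counter next to the original loop, sketched here (illustrative only):

  #include <cstdint>

  void hostLoopSketch() {
    // Counter mirrors {0,+,1}<L> even when the loop's own induction variable
    // steps by a different stride; its current value is what the kernel
    // receives as a subtree value for SCEVs that depend on L.
    for (int64_t I = 0, Counter = 0; I < 100; I += 4, ++Counter) {
      // ... kernel launch sees Counter ...
    }
  }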
- for (const Loop *L : Loops) { - const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), - SE.getUnknown(Builder.getInt64(1)), - L, SCEV::FlagAnyWrap); - Value *V = generateSCEV(OuterLIV); - OutsideLoopIterations[L] = SE.getUnknown(V); - SubtreeValues.insert(V); - } - - createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions); - setupKernelSubtreeFunctions(SubtreeFunctions); - - create(isl_ast_node_copy(Kernel->tree)); - - finalizeKernelArguments(Kernel); - Function *F = Builder.GetInsertBlock()->getParent(); - if (Arch == GPUArch::NVPTX64) - addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); - clearDominators(F); - clearScalarEvolution(F); - clearLoops(F); - - IDToValue = HostIDs; - - ValueMap = std::move(HostValueMap); - ScalarMap = std::move(HostScalarMap); - EscapeMap = std::move(HostEscapeMap); - IDToSAI.clear(); - Annotator.resetAlternativeAliasBases(); - for (auto &BasePtr : LocalArrays) - S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array); - LocalArrays.clear(); - - std::string ASMString = finalizeKernelFunction(); - Builder.SetInsertPoint(&HostInsertPoint); - Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); - - std::string Name = getKernelFuncName(Kernel->id); - Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); - Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); - Value *GPUKernel = createCallGetKernel(KernelString, NameString); - - Value *GridDimX, *GridDimY; - std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); - - createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, - BlockDimZ, Parameters); - createCallFreeKernel(GPUKernel); - - for (auto Id : KernelIds) - isl_id_free(Id); - - KernelIds.clear(); -} - -/// Compute the DataLayout string for the NVPTX backend. -/// -/// @param is64Bit Are we looking for a 64 bit architecture? -static std::string computeNVPTXDataLayout(bool is64Bit) { - std::string Ret = ""; - - if (!is64Bit) { - Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" - "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" - "64-v128:128:128-n16:32:64"; - } else { - Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" - "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" - "64-v128:128:128-n16:32:64"; - } - - return Ret; -} - -/// Compute the DataLayout string for a SPIR kernel. -/// -/// @param is64Bit Are we looking for a 64 bit architecture? 
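For reference, these target datalayout strings are installed on the kernel module verbatim and queried back through the regular DataLayout API; a usage sketch (layout string shortened here):

  #include "llvm/IR/Module.h"

  void applyNVPTXLayout(llvm::Module &M) {
    // 'e' = little endian; 'p:64:64:64' = 64-bit pointers (size:abi:pref
    // alignment); 'n16:32:64' = natively supported integer widths.
    M.setDataLayout("e-p:64:64:64-i64:64:64-n16:32:64");
    (void)M.getDataLayout().getPointerSizeInBits(); // 64
  }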
-static std::string computeSPIRDataLayout(bool is64Bit) { - std::string Ret = ""; - - if (!is64Bit) { - Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" - "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" - "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" - "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; - } else { - Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" - "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" - "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" - "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; - } - - return Ret; -} - -Function * -GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, - SetVector &SubtreeValues) { - std::vector Args; - std::string Identifier = getKernelFuncName(Kernel->id); - - std::vector MemoryType; - - for (long i = 0; i < Prog->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - - if (gpu_array_is_read_only_scalar(&Prog->array[i])) { - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); - Args.push_back(SAI->getElementType()); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); - } else { - static const int UseGlobalMemory = 1; - Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory)); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1))); - } - } - - int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); - - for (long i = 0; i < NumHostIters; i++) { - Args.push_back(Builder.getInt64Ty()); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); - } - - int NumVars = isl_space_dim(Kernel->space, isl_dim_param); - - for (long i = 0; i < NumVars; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); - Value *Val = IDToValue[Id]; - isl_id_free(Id); - Args.push_back(Val->getType()); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); - } - - for (auto *V : SubtreeValues) { - Args.push_back(V->getType()); - MemoryType.push_back( - ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); - } - - auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); - auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, - GPUModule.get()); - - std::vector EmptyStrings; - - for (unsigned int i = 0; i < MemoryType.size(); i++) { - EmptyStrings.push_back(MDString::get(FN->getContext(), "")); - } - - if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) { - FN->setMetadata("kernel_arg_addr_space", - MDNode::get(FN->getContext(), MemoryType)); - FN->setMetadata("kernel_arg_name", - MDNode::get(FN->getContext(), EmptyStrings)); - FN->setMetadata("kernel_arg_access_qual", - MDNode::get(FN->getContext(), EmptyStrings)); - FN->setMetadata("kernel_arg_type", - MDNode::get(FN->getContext(), EmptyStrings)); - FN->setMetadata("kernel_arg_type_qual", - MDNode::get(FN->getContext(), EmptyStrings)); - FN->setMetadata("kernel_arg_base_type", - MDNode::get(FN->getContext(), EmptyStrings)); - } - - switch (Arch) { - case GPUArch::NVPTX64: - FN->setCallingConv(CallingConv::PTX_Kernel); - break; - case GPUArch::SPIR32: - case GPUArch::SPIR64: - FN->setCallingConv(CallingConv::SPIR_KERNEL); - break; - } - - auto Arg = FN->arg_begin(); - for (long i = 0; i < Kernel->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - 
- Arg->setName(Kernel->array[i].array->name); - - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id)); - Type *EleTy = SAI->getElementType(); - Value *Val = &*Arg; - SmallVector Sizes; - isl_ast_build *Build = - isl_ast_build_from_context(isl_set_copy(Prog->context)); - Sizes.push_back(nullptr); - for (long j = 1, n = Kernel->array[i].array->n_index; j < n; j++) { - isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( - Build, isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j)); - auto V = ExprBuilder.create(DimSize); - Sizes.push_back(SE.getSCEV(V)); - } - const ScopArrayInfo *SAIRep = - S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array); - LocalArrays.push_back(Val); - - isl_ast_build_free(Build); - KernelIds.push_back(Id); - IDToSAI[Id] = SAIRep; - Arg++; - } - - for (long i = 0; i < NumHostIters; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); - Arg->setName(isl_id_get_name(Id)); - IDToValue[Id] = &*Arg; - KernelIDs.insert(std::unique_ptr(Id)); - Arg++; - } - - for (long i = 0; i < NumVars; i++) { - isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); - Arg->setName(isl_id_get_name(Id)); - Value *Val = IDToValue[Id]; - ValueMap[Val] = &*Arg; - IDToValue[Id] = &*Arg; - KernelIDs.insert(std::unique_ptr(Id)); - Arg++; - } - - for (auto *V : SubtreeValues) { - Arg->setName(V->getName()); - ValueMap[V] = &*Arg; - Arg++; - } - - return FN; -} - -void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { - Intrinsic::ID IntrinsicsBID[2]; - Intrinsic::ID IntrinsicsTID[3]; - - switch (Arch) { - case GPUArch::SPIR64: - case GPUArch::SPIR32: - llvm_unreachable("Cannot generate NVVM intrinsics for SPIR"); - case GPUArch::NVPTX64: - IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; - IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; - - IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; - IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; - IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; - break; - } - - auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { - std::string Name = isl_id_get_name(Id); - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); - Value *Val = Builder.CreateCall(IntrinsicFn, {}); - Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); - IDToValue[Id] = Val; - KernelIDs.insert(std::unique_ptr(Id)); - }; - - for (int i = 0; i < Kernel->n_grid; ++i) { - isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); - addId(Id, IntrinsicsBID[i]); - } - - for (int i = 0; i < Kernel->n_block; ++i) { - isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); - addId(Id, IntrinsicsTID[i]); - } -} - -void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel, - bool SizeTypeIs64bit) { - const char *GroupName[3] = {"__gen_ocl_get_group_id0", - "__gen_ocl_get_group_id1", - "__gen_ocl_get_group_id2"}; - - const char *LocalName[3] = {"__gen_ocl_get_local_id0", - "__gen_ocl_get_local_id1", - "__gen_ocl_get_local_id2"}; - IntegerType *SizeT = - SizeTypeIs64bit ? Builder.getInt64Ty() : Builder.getInt32Ty(); - - auto createFunc = [this](const char *Name, __isl_take isl_id *Id, - IntegerType *SizeT) mutable { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *FN = M->getFunction(Name); - - // If FN is not available, declare it. 
- if (!FN) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector Args; - FunctionType *Ty = FunctionType::get(SizeT, Args, false); - FN = Function::Create(Ty, Linkage, Name, M); - FN->setCallingConv(CallingConv::SPIR_FUNC); - } - - Value *Val = Builder.CreateCall(FN, {}); - if (SizeT == Builder.getInt32Ty()) - Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); - IDToValue[Id] = Val; - KernelIDs.insert(std::unique_ptr(Id)); - }; - - for (int i = 0; i < Kernel->n_grid; ++i) - createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i), SizeT); - - for (int i = 0; i < Kernel->n_block; ++i) - createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i), SizeT); -} - -void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { - auto Arg = FN->arg_begin(); - for (long i = 0; i < Kernel->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id)); - isl_id_free(Id); - - if (SAI->getNumberOfDimensions() > 0) { - Arg++; - continue; - } - - Value *Val = &*Arg; - - if (!gpu_array_is_read_only_scalar(&Prog->array[i])) { - Type *TypePtr = SAI->getElementType()->getPointerTo(); - Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr); - Val = Builder.CreateLoad(SAI->getElementType(), TypedArgPtr); - } - - Value *Alloca = BlockGen.getOrCreateAlloca(SAI); - Builder.CreateStore(Val, Alloca); - - Arg++; - } -} - -void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) { - auto *FN = Builder.GetInsertBlock()->getParent(); - auto Arg = FN->arg_begin(); - - bool StoredScalar = false; - for (long i = 0; i < Kernel->n_array; i++) { - if (!ppcg_kernel_requires_array_argument(Kernel, i)) - continue; - - isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); - const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage_copy(Id)); - isl_id_free(Id); - - if (SAI->getNumberOfDimensions() > 0) { - Arg++; - continue; - } - - if (gpu_array_is_read_only_scalar(&Prog->array[i])) { - Arg++; - continue; - } - - Value *Alloca = BlockGen.getOrCreateAlloca(SAI); - Value *ArgPtr = &*Arg; - Type *TypePtr = SAI->getElementType()->getPointerTo(); - Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); - Value *Val = Builder.CreateLoad(SAI->getElementType(), Alloca); - Builder.CreateStore(Val, TypedArgPtr); - StoredScalar = true; - - Arg++; - } - - if (StoredScalar) { - /// In case more than one thread contains scalar stores, the generated - /// code might be incorrect, if we only store at the end of the kernel. - /// To support this case we need to store these scalars back at each - /// memory store or at least before each kernel barrier. - if (Kernel->n_block != 0 || Kernel->n_grid != 0) { - BuildSuccessful = 0; - LLVM_DEBUG( - dbgs() << getUniqueScopName(&S) - << " has a store to a scalar value that" - " would be undefined to run in parallel. 
Bailing out.\n";); - } - } -} - -void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - - for (int i = 0; i < Kernel->n_var; ++i) { - struct ppcg_kernel_var &Var = Kernel->var[i]; - isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set); - Type *EleTy = ScopArrayInfo::getFromId(isl::manage(Id))->getElementType(); - - Type *ArrayTy = EleTy; - SmallVector Sizes; - - Sizes.push_back(nullptr); - for (unsigned int j = 1; j < Var.array->n_index; ++j) { - isl_val *Val = isl_vec_get_element_val(Var.size, j); - long Bound = isl_val_get_num_si(Val); - isl_val_free(Val); - Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound)); - } - - for (int j = Var.array->n_index - 1; j >= 0; --j) { - isl_val *Val = isl_vec_get_element_val(Var.size, j); - long Bound = isl_val_get_num_si(Val); - isl_val_free(Val); - ArrayTy = ArrayType::get(ArrayTy, Bound); - } - - const ScopArrayInfo *SAI; - Value *Allocation; - if (Var.type == ppcg_access_shared) { - auto GlobalVar = new GlobalVariable( - *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name, - nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3); - GlobalVar->setAlignment(llvm::Align(EleTy->getPrimitiveSizeInBits() / 8)); - GlobalVar->setInitializer(Constant::getNullValue(ArrayTy)); - - Allocation = GlobalVar; - } else if (Var.type == ppcg_access_private) { - Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array"); - } else { - llvm_unreachable("unknown variable type"); - } - SAI = - S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array); - Id = isl_id_alloc(S.getIslCtx().get(), Var.name, nullptr); - IDToValue[Id] = Allocation; - LocalArrays.push_back(Allocation); - KernelIds.push_back(Id); - IDToSAI[Id] = SAI; - } -} - -void GPUNodeBuilder::createKernelFunction( - ppcg_kernel *Kernel, SetVector &SubtreeValues, - SetVector &SubtreeFunctions) { - std::string Identifier = getKernelFuncName(Kernel->id); - GPUModule.reset(new Module(Identifier, Builder.getContext())); - - switch (Arch) { - case GPUArch::NVPTX64: - if (Runtime == GPURuntime::CUDA) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); - else if (Runtime == GPURuntime::OpenCL) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); - GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); - break; - case GPUArch::SPIR32: - GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown")); - GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */)); - break; - case GPUArch::SPIR64: - GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown")); - GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */)); - break; - } - - Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); - - BasicBlock *PrevBlock = Builder.GetInsertBlock(); - auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); - - DT.addNewBlock(EntryBlock, PrevBlock); - - Builder.SetInsertPoint(EntryBlock); - Builder.CreateRetVoid(); - Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); - - ScopDetection::markFunctionAsInvalid(FN); - - prepareKernelArguments(Kernel, FN); - createKernelVariables(Kernel, FN); - - switch (Arch) { - case GPUArch::NVPTX64: - insertKernelIntrinsics(Kernel); - break; - case GPUArch::SPIR32: - insertKernelCallsSPIR(Kernel, false); - break; - case GPUArch::SPIR64: - insertKernelCallsSPIR(Kernel, true); - break; - } -} - -std::string 
GPUNodeBuilder::createKernelASM() {
-  llvm::Triple GPUTriple;
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    switch (Runtime) {
-    case GPURuntime::CUDA:
-      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
-      break;
-    case GPURuntime::OpenCL:
-      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
-      break;
-    }
-    break;
-  case GPUArch::SPIR64:
-  case GPUArch::SPIR32:
-    std::string SPIRAssembly;
-    raw_string_ostream IROstream(SPIRAssembly);
-    IROstream << *GPUModule;
-    IROstream.flush();
-    return SPIRAssembly;
-  }
-
-  std::string ErrMsg;
-  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
-
-  if (!GPUTarget) {
-    errs() << ErrMsg << "\n";
-    return "";
-  }
-
-  TargetOptions Options;
-  Options.UnsafeFPMath = FastMath;
-
-  std::string subtarget;
-
-  switch (Arch) {
-  case GPUArch::NVPTX64:
-    subtarget = CudaVersion;
-    break;
-  case GPUArch::SPIR32:
-  case GPUArch::SPIR64:
-    llvm_unreachable("No subtarget for SPIR architecture");
-  }
-
-  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
-      GPUTriple.getTriple(), subtarget, "", Options, std::nullopt));
-
-  SmallString<0> ASMString;
-  raw_svector_ostream ASMStream(ASMString);
-  llvm::legacy::PassManager PM;
-
-  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));
-
-  if (TargetM->addPassesToEmitFile(PM, ASMStream, nullptr, CGFT_AssemblyFile,
-                                   true /* verify */)) {
-    errs() << "The target does not support generation of this file type!\n";
-    return "";
-  }
-
-  PM.run(*GPUModule);
-
-  return ASMStream.str().str();
-}
-
-bool GPUNodeBuilder::requiresCUDALibDevice() {
-  bool RequiresLibDevice = false;
-  for (Function &F : GPUModule->functions()) {
-    if (!F.isDeclaration())
-      continue;
-
-    const std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(F.getName());
-    if (CUDALibDeviceFunc.length() != 0) {
-      // We need to handle the case where a module looks like this:
-      //   @expf(..)
-      //   @llvm.exp.f32(..)
-      // Both of these functions would be renamed to `__nv_expf`.
-      //
-      // So, we must first check for the existence of the libdevice function.
-      // If it exists, we replace our current function with it.
-      //
-      // If it does not exist, we rename the current function to the
-      // libdevice function name.
-      if (Function *Replacement = F.getParent()->getFunction(CUDALibDeviceFunc))
-        F.replaceAllUsesWith(Replacement);
-      else
-        F.setName(CUDALibDeviceFunc);
-      RequiresLibDevice = true;
-    }
-  }
-
-  return RequiresLibDevice;
-}
-
-void GPUNodeBuilder::addCUDALibDevice() {
-  if (Arch != GPUArch::NVPTX64)
-    return;
-
-  if (requiresCUDALibDevice()) {
-    SMDiagnostic Error;
-
-    errs() << CUDALibDevice << "\n";
-    auto LibDeviceModule =
-        parseIRFile(CUDALibDevice, Error, GPUModule->getContext());
-
-    if (!LibDeviceModule) {
-      BuildSuccessful = false;
-      report_fatal_error("Could not find or load libdevice. Skipping GPU "
-                         "kernel generation. Please set -polly-acc-libdevice "
-                         "accordingly.\n");
-      return;
-    }
-
-    Linker L(*GPUModule);
-
-    // Set an nvptx64 target triple to avoid linker warnings. The original
-    // triple of the libdevice files is nvptx-unknown-unknown.
- LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); - L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded); - } -} - -std::string GPUNodeBuilder::finalizeKernelFunction() { - - if (verifyModule(*GPUModule)) { - LLVM_DEBUG(dbgs() << "verifyModule failed on module:\n"; - GPUModule->print(dbgs(), nullptr); dbgs() << "\n";); - LLVM_DEBUG(dbgs() << "verifyModule Error:\n"; - verifyModule(*GPUModule, &dbgs());); - - if (FailOnVerifyModuleFailure) - llvm_unreachable("VerifyModule failed."); - - BuildSuccessful = false; - return ""; - } - - addCUDALibDevice(); - - if (DumpKernelIR) - outs() << *GPUModule << "\n"; - - if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) { - // Optimize module. - llvm::legacy::PassManager OptPasses; - PassManagerBuilder PassBuilder; - PassBuilder.OptLevel = 3; - PassBuilder.SizeLevel = 0; - PassBuilder.populateModulePassManager(OptPasses); - OptPasses.run(*GPUModule); - } - - std::string Assembly = createKernelASM(); - - if (DumpKernelASM) - outs() << Assembly << "\n"; - - GPUModule.release(); - KernelIDs.clear(); - - return Assembly; -} -/// Construct an `isl_pw_aff_list` from a vector of `isl_pw_aff` -/// @param PwAffs The list of piecewise affine functions to create an -/// `isl_pw_aff_list` from. We expect an rvalue ref because -/// all the isl_pw_aff are used up by this function. -/// -/// @returns The `isl_pw_aff_list`. -__isl_give isl_pw_aff_list * -createPwAffList(isl_ctx *Context, - const std::vector<__isl_take isl_pw_aff *> &&PwAffs) { - isl_pw_aff_list *List = isl_pw_aff_list_alloc(Context, PwAffs.size()); - - for (unsigned i = 0; i < PwAffs.size(); i++) { - List = isl_pw_aff_list_insert(List, i, PwAffs[i]); - } - return List; -} - -/// Align all the `PwAffs` such that they have the same parameter dimensions. -/// -/// We loop over all `pw_aff` and align all of their spaces together to -/// create a common space for all the `pw_aff`. This common space is the -/// `AlignSpace`. We then align all the `pw_aff` to this space. We start -/// with the given `SeedSpace`. -/// @param PwAffs The list of piecewise affine functions we want to align. -/// This is an rvalue reference because the entire vector is -/// used up by the end of the operation. -/// @param SeedSpace The space to start the alignment process with. -/// @returns A std::pair, whose first element is the aligned space, -/// whose second element is the vector of aligned piecewise -/// affines. -static std::pair<__isl_give isl_space *, std::vector<__isl_give isl_pw_aff *>> -alignPwAffs(const std::vector<__isl_take isl_pw_aff *> &&PwAffs, - __isl_take isl_space *SeedSpace) { - assert(SeedSpace && "Invalid seed space given."); - - isl_space *AlignSpace = SeedSpace; - for (isl_pw_aff *PwAff : PwAffs) { - isl_space *PwAffSpace = isl_pw_aff_get_domain_space(PwAff); - AlignSpace = isl_space_align_params(AlignSpace, PwAffSpace); - } - std::vector AdjustedPwAffs; - - for (unsigned i = 0; i < PwAffs.size(); i++) { - isl_pw_aff *Adjusted = PwAffs[i]; - assert(Adjusted && "Invalid pw_aff given."); - Adjusted = isl_pw_aff_align_params(Adjusted, isl_space_copy(AlignSpace)); - AdjustedPwAffs.push_back(Adjusted); - } - return std::make_pair(AlignSpace, AdjustedPwAffs); -} - -namespace { -class PPCGCodeGeneration final : public ScopPass { -public: - static char ID; - - GPURuntime Runtime = GPURuntime::CUDA; - - GPUArch Architecture = GPUArch::NVPTX64; - - /// The scop that is currently processed. 
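// Toy illustration of the parameter alignment that alignPwAffs() above
// performs: two pw_affs over different parameter lists are moved into one
// common space before they can be combined into a single object. The
// expressions and parameter names are made up for the example.
#include <isl/aff.h>
#include <isl/ctx.h>
#include <isl/space.h>

static void alignTwoPwAffs(isl_ctx *Ctx) {
  isl_pw_aff *A = isl_pw_aff_read_from_str(Ctx, "[n] -> { [i] -> [(n + i)] }");
  isl_pw_aff *B = isl_pw_aff_read_from_str(Ctx, "[m] -> { [i] -> [(2m)] }");
  isl_space *Common = isl_space_align_params(
      isl_pw_aff_get_domain_space(A), isl_pw_aff_get_domain_space(B));
  // Both are now functions over the combined parameter list [n, m].
  A = isl_pw_aff_align_params(A, isl_space_copy(Common));
  B = isl_pw_aff_align_params(B, Common);
  isl_pw_aff_free(A);
  isl_pw_aff_free(B);
}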
- Scop *S; - - LoopInfo *LI; - DominatorTree *DT; - ScalarEvolution *SE; - const DataLayout *DL; - RegionInfo *RI; - - PPCGCodeGeneration() : ScopPass(ID) { - // Apply defaults. - Runtime = GPURuntimeChoice; - Architecture = GPUArchChoice; - } - - /// Construct compilation options for PPCG. - /// - /// @returns The compilation options. - ppcg_options *createPPCGOptions() { - auto DebugOptions = - (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); - auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); - - DebugOptions->dump_schedule_constraints = false; - DebugOptions->dump_schedule = false; - DebugOptions->dump_final_schedule = false; - DebugOptions->dump_sizes = false; - DebugOptions->verbose = false; - - Options->debug = DebugOptions; - - Options->group_chains = false; - Options->reschedule = true; - Options->scale_tile_loops = false; - Options->wrap = false; - - Options->non_negative_parameters = false; - Options->ctx = nullptr; - Options->sizes = nullptr; - - Options->tile = true; - Options->tile_size = 32; - - Options->isolate_full_tiles = false; - - Options->use_private_memory = PrivateMemory; - Options->use_shared_memory = SharedMemory; - Options->max_shared_memory = 48 * 1024; - - Options->target = PPCG_TARGET_CUDA; - Options->openmp = false; - Options->linearize_device_arrays = true; - Options->allow_gnu_extensions = false; - - Options->unroll_copy_shared = false; - Options->unroll_gpu_tile = false; - Options->live_range_reordering = true; - - Options->live_range_reordering = true; - Options->hybrid = false; - Options->opencl_compiler_options = nullptr; - Options->opencl_use_gpu = false; - Options->opencl_n_include_file = 0; - Options->opencl_include_files = nullptr; - Options->opencl_print_kernel_types = false; - Options->opencl_embed_kernel_code = false; - - Options->save_schedule_file = nullptr; - Options->load_schedule_file = nullptr; - - return Options; - } - - /// Get a tagged access relation containing all accesses of type @p AccessTy. - /// - /// Instead of a normal access of the form: - /// - /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] - /// - /// a tagged access has the form - /// - /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] - /// - /// where 'id' is an additional space that references the memory access that - /// triggered the access. - /// - /// @param AccessTy The type of the memory accesses to collect. - /// - /// @return The relation describing all tagged memory accesses. - isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { - isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace().release()); - - for (auto &Stmt : *S) - for (auto &Acc : Stmt) - if (Acc->getType() == AccessTy) { - isl_map *Relation = Acc->getAccessRelation().release(); - Relation = - isl_map_intersect_domain(Relation, Stmt.getDomain().release()); - - isl_space *Space = isl_map_get_space(Relation); - Space = isl_space_range(Space); - Space = isl_space_from_range(Space); - Space = - isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release()); - isl_map *Universe = isl_map_universe(Space); - Relation = isl_map_domain_product(Relation, Universe); - Accesses = isl_union_map_add_map(Accesses, Relation); - } - - return Accesses; - } - - /// Get the set of all read accesses, tagged with the access id. - /// - /// @see getTaggedAccesses - isl_union_map *getTaggedReads() { - return getTaggedAccesses(MemoryAccess::READ); - } - - /// Get the set of all may (and must) accesses, tagged with the access id. 
- /// - /// @see getTaggedAccesses - isl_union_map *getTaggedMayWrites() { - return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), - getTaggedAccesses(MemoryAccess::MUST_WRITE)); - } - - /// Get the set of all must accesses, tagged with the access id. - /// - /// @see getTaggedAccesses - isl_union_map *getTaggedMustWrites() { - return getTaggedAccesses(MemoryAccess::MUST_WRITE); - } - - /// Collect parameter and array names as isl_ids. - /// - /// To reason about the different parameters and arrays used, ppcg requires - /// a list of all isl_ids in use. As PPCG traditionally performs - /// source-to-source compilation each of these isl_ids is mapped to the - /// expression that represents it. As we do not have a corresponding - /// expression in Polly, we just map each id to a 'zero' expression to match - /// the data format that ppcg expects. - /// - /// @returns Retun a map from collected ids to 'zero' ast expressions. - __isl_give isl_id_to_ast_expr *getNames() { - auto *Names = isl_id_to_ast_expr_alloc( - S->getIslCtx().get(), - S->getNumParams() + std::distance(S->array_begin(), S->array_end())); - auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx().get())); - - for (const SCEV *P : S->parameters()) { - isl_id *Id = S->getIdForParam(P).release(); - Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); - } - - for (auto &Array : S->arrays()) { - auto Id = Array->getBasePtrId().release(); - Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); - } - - isl_ast_expr_free(Zero); - - return Names; - } - - /// Create a new PPCG scop from the current scop. - /// - /// The PPCG scop is initialized with data from the current polly::Scop. From - /// this initial data, the data-dependences in the PPCG scop are initialized. - /// We do not use Polly's dependence analysis for now, to ensure we match - /// the PPCG default behaviour more closely. - /// - /// @returns A new ppcg scop. - ppcg_scop *createPPCGScop() { - MustKillsInfo KillsInfo = computeMustKillsInfo(*S); - - auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); - - PPCGScop->options = createPPCGOptions(); - // enable live range reordering - PPCGScop->options->live_range_reordering = 1; - - PPCGScop->start = 0; - PPCGScop->end = 0; - - PPCGScop->context = S->getContext().release(); - PPCGScop->domain = S->getDomains().release(); - // TODO: investigate this further. PPCG calls collect_call_domains. 
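// A toy version of the "tagged access" construction documented at
// getTaggedAccesses() above: a universe map { ref0[] -> A[o] } is
// domain-multiplied with the plain access so that the reference id rides
// along, yielding { [S[i] -> ref0[]] -> A[i] }. "S", "A" and "ref0" are
// made-up names for the example.
#include <isl/ctx.h>
#include <isl/map.h>
#include <isl/space.h>

static __isl_give isl_map *tagAccess(isl_ctx *Ctx) {
  isl_map *Access =
      isl_map_read_from_str(Ctx, "{ S[i] -> A[i] : 0 <= i < 100 }");
  isl_space *Space = isl_map_get_space(Access);
  Space = isl_space_from_range(isl_space_range(Space));
  Space = isl_space_set_tuple_name(Space, isl_dim_in, "ref0");
  isl_map *Universe = isl_map_universe(Space);
  return isl_map_domain_product(Access, Universe);
}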
- PPCGScop->call = isl_union_set_from_set(S->getContext().release()); - PPCGScop->tagged_reads = getTaggedReads(); - PPCGScop->reads = S->getReads().release(); - PPCGScop->live_in = nullptr; - PPCGScop->tagged_may_writes = getTaggedMayWrites(); - PPCGScop->may_writes = S->getWrites().release(); - PPCGScop->tagged_must_writes = getTaggedMustWrites(); - PPCGScop->must_writes = S->getMustWrites().release(); - PPCGScop->live_out = nullptr; - PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.release(); - PPCGScop->must_kills = KillsInfo.MustKills.release(); - - PPCGScop->tagger = nullptr; - PPCGScop->independence = - isl_union_map_empty(isl_set_get_space(PPCGScop->context)); - PPCGScop->dep_flow = nullptr; - PPCGScop->tagged_dep_flow = nullptr; - PPCGScop->dep_false = nullptr; - PPCGScop->dep_forced = nullptr; - PPCGScop->dep_order = nullptr; - PPCGScop->tagged_dep_order = nullptr; - - PPCGScop->schedule = S->getScheduleTree().release(); - // If we have something non-trivial to kill, add it to the schedule - if (KillsInfo.KillsSchedule.get()) - PPCGScop->schedule = isl_schedule_sequence( - PPCGScop->schedule, KillsInfo.KillsSchedule.release()); - - PPCGScop->names = getNames(); - PPCGScop->pet = nullptr; - - compute_tagger(PPCGScop); - compute_dependences(PPCGScop); - eliminate_dead_code(PPCGScop); - - return PPCGScop; - } - - /// Collect the array accesses in a statement. - /// - /// @param Stmt The statement for which to collect the accesses. - /// - /// @returns A list of array accesses. - gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) { - gpu_stmt_access *Accesses = nullptr; - - for (MemoryAccess *Acc : Stmt) { - auto Access = - isl_alloc_type(S->getIslCtx().get(), struct gpu_stmt_access); - Access->read = Acc->isRead(); - Access->write = Acc->isWrite(); - Access->access = Acc->getAccessRelation().release(); - isl_space *Space = isl_map_get_space(Access->access); - Space = isl_space_range(Space); - Space = isl_space_from_range(Space); - Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release()); - isl_map *Universe = isl_map_universe(Space); - Access->tagged_access = - isl_map_domain_product(Acc->getAccessRelation().release(), Universe); - Access->exact_write = !Acc->isMayWrite(); - Access->ref_id = Acc->getId().release(); - Access->next = Accesses; - Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions(); - // TODO: Also mark one-element accesses to arrays as fixed-element. - Access->fixed_element = - Acc->isLatestScalarKind() ? isl_bool_true : isl_bool_false; - Accesses = Access; - } - - return Accesses; - } - - /// Collect the list of GPU statements. - /// - /// Each statement has an id, a pointer to the underlying data structure, - /// as well as a list with all memory accesses. - /// - /// TODO: Initialize the list of memory accesses. - /// - /// @returns A linked-list of statements. - gpu_stmt *getStatements() { - gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx().get(), struct gpu_stmt, - std::distance(S->begin(), S->end())); - - int i = 0; - for (auto &Stmt : *S) { - gpu_stmt *GPUStmt = &Stmts[i]; - - GPUStmt->id = Stmt.getDomainId().release(); - - // We use the pet stmt pointer to keep track of the Polly statements. - GPUStmt->stmt = (pet_stmt *)&Stmt; - GPUStmt->accesses = getStmtAccesses(Stmt); - i++; - } - - return Stmts; - } - - /// Derive the extent of an array. - /// - /// The extent of an array is the set of elements that are within the - /// accessed array. 
For the inner dimensions, the extent constraints are - /// 0 and the size of the corresponding array dimension. For the first - /// (outermost) dimension, the extent constraints are the minimal and maximal - /// subscript value for the first dimension. - /// - /// @param Array The array to derive the extent for. - /// - /// @returns An isl_set describing the extent of the array. - isl::set getExtent(ScopArrayInfo *Array) { - unsigned NumDims = Array->getNumberOfDimensions(); - - if (Array->getNumberOfDimensions() == 0) - return isl::set::universe(Array->getSpace()); - - isl::union_map Accesses = S->getAccesses(Array); - isl::union_set AccessUSet = Accesses.range(); - AccessUSet = AccessUSet.coalesce(); - AccessUSet = AccessUSet.detect_equalities(); - AccessUSet = AccessUSet.coalesce(); - - if (AccessUSet.is_empty()) - return isl::set::empty(Array->getSpace()); - - isl::set AccessSet = AccessUSet.extract_set(Array->getSpace()); - - isl::local_space LS = isl::local_space(Array->getSpace()); - - isl::pw_aff Val = isl::aff::var_on_domain(LS, isl::dim::set, 0); - isl::pw_aff OuterMin = AccessSet.dim_min(0); - isl::pw_aff OuterMax = AccessSet.dim_max(0); - OuterMin = OuterMin.add_dims(isl::dim::in, - unsignedFromIslSize(Val.dim(isl::dim::in))); - OuterMax = OuterMax.add_dims(isl::dim::in, - unsignedFromIslSize(Val.dim(isl::dim::in))); - OuterMin = OuterMin.set_tuple_id(isl::dim::in, Array->getBasePtrId()); - OuterMax = OuterMax.set_tuple_id(isl::dim::in, Array->getBasePtrId()); - - isl::set Extent = isl::set::universe(Array->getSpace()); - - Extent = Extent.intersect(OuterMin.le_set(Val)); - Extent = Extent.intersect(OuterMax.ge_set(Val)); - - for (unsigned i = 1; i < NumDims; ++i) - Extent = Extent.lower_bound_si(isl::dim::set, i, 0); - - for (unsigned i = 0; i < NumDims; ++i) { - isl::pw_aff PwAff = Array->getDimensionSizePw(i); - - // isl_pw_aff can be NULL for zero dimension. Only in the case of a - // Fortran array will we have a legitimate dimension. - if (PwAff.is_null()) { - assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension"); - continue; - } - - isl::pw_aff Val = isl::aff::var_on_domain( - isl::local_space(Array->getSpace()), isl::dim::set, i); - PwAff = PwAff.add_dims(isl::dim::in, - unsignedFromIslSize(Val.dim(isl::dim::in))); - PwAff = PwAff.set_tuple_id(isl::dim::in, Val.get_tuple_id(isl::dim::in)); - isl::set Set = PwAff.gt_set(Val); - Extent = Set.intersect(Extent); - } - - return Extent; - } - - /// Derive the bounds of an array. - /// - /// For the first dimension we derive the bound of the array from the extent - /// of this dimension. For inner dimensions we obtain their size directly from - /// ScopArrayInfo. - /// - /// @param PPCGArray The array to compute bounds for. - /// @param Array The polly array from which to take the information. 
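// Worked example, with made-up sizes, of the extent described above: for
// accesses { S[i,j] -> A[i+1,j] : 0 <= i,j < 8 } to an 8x8 array, the
// outermost dimension is bounded by the minimal and maximal first
// subscript, while inner dimensions receive the [0, size) constraints.
#include <isl/ctx.h>
#include <isl/set.h>

static __isl_give isl_set *exampleExtent(isl_ctx *Ctx) {
  // Dim 0: between min (1) and max (8) of i+1; dim 1: 0 <= o1 < 8.
  return isl_set_read_from_str(
      Ctx, "{ A[o0, o1] : 1 <= o0 <= 8 and 0 <= o1 < 8 }");
}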
- void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) { - std::vector Bounds; - - if (PPCGArray.n_index > 0) { - if (isl_set_is_empty(PPCGArray.extent)) { - isl_set *Dom = isl_set_copy(PPCGArray.extent); - isl_local_space *LS = isl_local_space_from_space( - isl_space_params(isl_set_get_space(Dom))); - isl_set_free(Dom); - isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS)); - Bounds.push_back(Zero); - } else { - isl_set *Dom = isl_set_copy(PPCGArray.extent); - Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1); - isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0); - isl_set_free(Dom); - Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound)); - isl_local_space *LS = - isl_local_space_from_space(isl_set_get_space(Dom)); - isl_aff *One = isl_aff_zero_on_domain(LS); - One = isl_aff_add_constant_si(One, 1); - Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One)); - Bound = isl_pw_aff_gist(Bound, S->getContext().release()); - Bounds.push_back(Bound); - } - } - - for (unsigned i = 1; i < PPCGArray.n_index; ++i) { - isl_pw_aff *Bound = Array->getDimensionSizePw(i).release(); - auto LS = isl_pw_aff_get_domain_space(Bound); - auto Aff = isl_multi_aff_zero(LS); - - // We need types to work out, which is why we perform this weird dance - // with `Aff` and `Bound`. Consider this example: - - // LS: [p] -> { [] } - // Zero: [p] -> { [] } | Implicitly, is [p] -> { ~ -> [] }. - // This `~` is used to denote a "null space" (which is different from - // a *zero dimensional* space), which is something that ISL does not - // show you when pretty printing. - - // Bound: [p] -> { [] -> [(10p)] } | Here, the [] is a *zero dimensional* - // space, not a "null space" which does not exist at all. - - // When we pullback (precompose) `Bound` with `Zero`, we get: - // Bound . Zero = - // ([p] -> { [] -> [(10p)] }) . ([p] -> {~ -> [] }) = - // [p] -> { ~ -> [(10p)] } = - // [p] -> [(10p)] (as ISL pretty prints it) - // Bound Pullback: [p] -> { [(10p)] } - - // We want this kind of an expression for Bound, without a - // zero dimensional input, but with a "null space" input for the types - // to work out later on, as far as I (Siddharth Bhat) understand. - // I was unable to find a reference to this in the ISL manual. - // References: Tobias Grosser. - - Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff); - Bounds.push_back(Bound); - } - - /// To construct a `isl_multi_pw_aff`, we need all the indivisual `pw_aff` - /// to have the same parameter dimensions. So, we need to align them to an - /// appropriate space. - /// Scop::Context is _not_ an appropriate space, because when we have - /// `-polly-ignore-parameter-bounds` enabled, the Scop::Context does not - /// contain all parameter dimensions. - /// So, use the helper `alignPwAffs` to align all the `isl_pw_aff` together. 
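// Compact illustration of the pullback trick described in the long comment
// above: precomposing a pw_aff that has a zero-dimensional tuple input with
// isl_multi_aff_zero over its domain space removes that tuple input, leaving
// a purely parametric expression. The value 10p is made up.
#include <isl/aff.h>
#include <isl/ctx.h>

static __isl_give isl_pw_aff *dropZeroDimInput(isl_ctx *Ctx) {
  isl_pw_aff *Bound =
      isl_pw_aff_read_from_str(Ctx, "[p] -> { [] -> [(10p)] }");
  isl_multi_aff *Zero =
      isl_multi_aff_zero(isl_pw_aff_get_domain_space(Bound));
  // The result prints as [p] -> { [(10p)] }.
  return isl_pw_aff_pullback_multi_aff(Bound, Zero);
}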
- isl_space *SeedAlignSpace = S->getParamSpace().release(); - SeedAlignSpace = isl_space_add_dims(SeedAlignSpace, isl_dim_set, 1); - - isl_space *AlignSpace = nullptr; - std::vector AlignedBounds; - std::tie(AlignSpace, AlignedBounds) = - alignPwAffs(std::move(Bounds), SeedAlignSpace); - - assert(AlignSpace && "alignPwAffs did not initialise AlignSpace"); - - isl_pw_aff_list *BoundsList = - createPwAffList(S->getIslCtx().get(), std::move(AlignedBounds)); - - isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent); - BoundsSpace = isl_space_align_params(BoundsSpace, AlignSpace); - - assert(BoundsSpace && "Unable to access space of array."); - assert(BoundsList && "Unable to access list of bounds."); - - PPCGArray.bound = - isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList); - assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly."); - } - - /// Create the arrays for @p PPCGProg. - /// - /// @param PPCGProg The program to compute the arrays for. - void createArrays(gpu_prog *PPCGProg, - const SmallVector &ValidSAIs) { - int i = 0; - for (auto &Array : ValidSAIs) { - std::string TypeName; - raw_string_ostream OS(TypeName); - - OS << *Array->getElementType(); - TypeName = OS.str(); - - gpu_array_info &PPCGArray = PPCGProg->array[i]; - - PPCGArray.space = Array->getSpace().release(); - PPCGArray.type = strdup(TypeName.c_str()); - PPCGArray.size = DL->getTypeAllocSize(Array->getElementType()); - PPCGArray.name = strdup(Array->getName().c_str()); - PPCGArray.extent = nullptr; - PPCGArray.n_index = Array->getNumberOfDimensions(); - PPCGArray.extent = getExtent(Array).release(); - PPCGArray.n_ref = 0; - PPCGArray.refs = nullptr; - PPCGArray.accessed = true; - PPCGArray.read_only_scalar = - Array->isReadOnly() && Array->getNumberOfDimensions() == 0; - PPCGArray.has_compound_element = false; - PPCGArray.local = false; - PPCGArray.declare_local = false; - PPCGArray.global = false; - PPCGArray.linearize = false; - PPCGArray.dep_order = nullptr; - PPCGArray.user = Array; - - PPCGArray.bound = nullptr; - setArrayBounds(PPCGArray, Array); - i++; - - collect_references(PPCGProg, &PPCGArray); - PPCGArray.only_fixed_element = only_fixed_element_accessed(&PPCGArray); - } - } - - /// Create an identity map between the arrays in the scop. - /// - /// @returns An identity map between the arrays in the scop. - isl_union_map *getArrayIdentity() { - isl_union_map *Maps = isl_union_map_empty(S->getParamSpace().release()); - - for (auto &Array : S->arrays()) { - isl_space *Space = Array->getSpace().release(); - Space = isl_space_map_from_set(Space); - isl_map *Identity = isl_map_identity(Space); - Maps = isl_union_map_add_map(Maps, Identity); - } - - return Maps; - } - - /// Create a default-initialized PPCG GPU program. - /// - /// @returns A new gpu program description. - gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) { - - if (!PPCGScop) - return nullptr; - - auto PPCGProg = isl_calloc_type(S->getIslCtx().get(), struct gpu_prog); - - PPCGProg->ctx = S->getIslCtx().get(); - PPCGProg->scop = PPCGScop; - PPCGProg->context = isl_set_copy(PPCGScop->context); - PPCGProg->read = isl_union_map_copy(PPCGScop->reads); - PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes); - PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes); - PPCGProg->tagged_must_kill = - isl_union_map_copy(PPCGScop->tagged_must_kills); - PPCGProg->to_inner = getArrayIdentity(); - PPCGProg->to_outer = getArrayIdentity(); - // TODO: verify that this assignment is correct. 
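// The identity union map built by getArrayIdentity() above, reduced to a
// single hypothetical two-dimensional array space { A[i, j] }:
#include <isl/ctx.h>
#include <isl/map.h>
#include <isl/space.h>
#include <isl/union_map.h>

static __isl_give isl_union_map *identityOverArray(isl_ctx *Ctx) {
  isl_space *ASpace = isl_space_set_alloc(Ctx, /*nparam=*/0, /*dim=*/2);
  ASpace = isl_space_set_tuple_name(ASpace, isl_dim_set, "A");
  isl_map *Id = isl_map_identity(isl_space_map_from_set(ASpace));
  return isl_union_map_from_map(Id); // { A[i, j] -> A[i, j] }
}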
- PPCGProg->any_to_outer = nullptr; - PPCGProg->n_stmts = std::distance(S->begin(), S->end()); - PPCGProg->stmts = getStatements(); - - // Only consider arrays that have a non-empty extent. - // Otherwise, this will cause us to consider the following kinds of - // empty arrays: - // 1. Invariant loads that are represented by SAI objects. - // 2. Arrays with statically known zero size. - auto ValidSAIsRange = - make_filter_range(S->arrays(), [this](ScopArrayInfo *SAI) -> bool { - return !getExtent(SAI).is_empty(); - }); - SmallVector ValidSAIs(ValidSAIsRange.begin(), - ValidSAIsRange.end()); - - PPCGProg->n_array = - ValidSAIs.size(); // std::distance(S->array_begin(), S->array_end()); - PPCGProg->array = isl_calloc_array( - S->getIslCtx().get(), struct gpu_array_info, PPCGProg->n_array); - - createArrays(PPCGProg, ValidSAIs); - - PPCGProg->array_order = nullptr; - collect_order_dependences(PPCGProg); - - PPCGProg->may_persist = compute_may_persist(PPCGProg); - return PPCGProg; - } - - struct PrintGPUUserData { - struct cuda_info *CudaInfo; - struct gpu_prog *PPCGProg; - std::vector Kernels; - }; - - /// Print a user statement node in the host code. - /// - /// We use ppcg's printing facilities to print the actual statement and - /// additionally build up a list of all kernels that are encountered in the - /// host ast. - /// - /// @param P The printer to print to - /// @param Options The printing options to use - /// @param Node The node to print - /// @param User A user pointer to carry additional data. This pointer is - /// expected to be of type PrintGPUUserData. - /// - /// @returns A printer to which the output has been printed. - static __isl_give isl_printer * - printHostUser(__isl_take isl_printer *P, - __isl_take isl_ast_print_options *Options, - __isl_take isl_ast_node *Node, void *User) { - auto Data = (struct PrintGPUUserData *)User; - auto Id = isl_ast_node_get_annotation(Node); - - if (Id) { - bool IsUser = !strcmp(isl_id_get_name(Id), "user"); - - // If this is a user statement, format it ourselves as ppcg would - // otherwise try to call pet functionality that is not available in - // Polly. - if (IsUser) { - P = isl_printer_start_line(P); - P = isl_printer_print_ast_node(P, Node); - P = isl_printer_end_line(P); - isl_id_free(Id); - isl_ast_print_options_free(Options); - return P; - } - - auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); - isl_id_free(Id); - Data->Kernels.push_back(Kernel); - } - - return print_host_user(P, Options, Node, User); - } - - /// Print C code corresponding to the control flow in @p Kernel. - /// - /// @param Kernel The kernel to print - void printKernel(ppcg_kernel *Kernel) { - auto *P = isl_printer_to_str(S->getIslCtx().get()); - P = isl_printer_set_output_format(P, ISL_FORMAT_C); - auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get()); - P = isl_ast_node_print(Kernel->tree, P, Options); - char *String = isl_printer_get_str(P); - outs() << String << "\n"; - free(String); - isl_printer_free(P); - } - - /// Print C code corresponding to the GPU code described by @p Tree. - /// - /// @param Tree An AST describing GPU code - /// @param PPCGProg The PPCG program from which @Tree has been constructed. 
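// Minimal form of the printing pattern used by printKernel() above: render
// any isl_ast_node as C code into a string.
#include <isl/ast.h>
#include <isl/ctx.h>
#include <isl/printer.h>

static char *astToC(isl_ctx *Ctx, __isl_keep isl_ast_node *Tree) {
  isl_printer *P = isl_printer_to_str(Ctx);
  P = isl_printer_set_output_format(P, ISL_FORMAT_C);
  isl_ast_print_options *Options = isl_ast_print_options_alloc(Ctx);
  P = isl_ast_node_print(Tree, P, Options); // Consumes Options.
  char *Str = isl_printer_get_str(P);       // Caller frees with free().
  isl_printer_free(P);
  return Str;
}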
- void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { - auto *P = isl_printer_to_str(S->getIslCtx().get()); - P = isl_printer_set_output_format(P, ISL_FORMAT_C); - - PrintGPUUserData Data; - Data.PPCGProg = PPCGProg; - - auto *Options = isl_ast_print_options_alloc(S->getIslCtx().get()); - Options = - isl_ast_print_options_set_print_user(Options, printHostUser, &Data); - P = isl_ast_node_print(Tree, P, Options); - char *String = isl_printer_get_str(P); - outs() << "# host\n"; - outs() << String << "\n"; - free(String); - isl_printer_free(P); - - for (auto Kernel : Data.Kernels) { - outs() << "# kernel" << Kernel->id << "\n"; - printKernel(Kernel); - } - } - - // Generate a GPU program using PPCG. - // - // GPU mapping consists of multiple steps: - // - // 1) Compute new schedule for the program. - // 2) Map schedule to GPU (TODO) - // 3) Generate code for new schedule (TODO) - // - // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer - // is mostly CPU specific. Instead, we use PPCG's GPU code generation - // strategy directly from this pass. - gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { - - auto PPCGGen = isl_calloc_type(S->getIslCtx().get(), struct gpu_gen); - - PPCGGen->ctx = S->getIslCtx().get(); - PPCGGen->options = PPCGScop->options; - PPCGGen->print = nullptr; - PPCGGen->print_user = nullptr; - PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; - PPCGGen->prog = PPCGProg; - PPCGGen->tree = nullptr; - PPCGGen->types.n = 0; - PPCGGen->types.name = nullptr; - PPCGGen->sizes = nullptr; - PPCGGen->used_sizes = nullptr; - PPCGGen->kernel_id = 0; - - // Set scheduling strategy to same strategy PPCG is using. - isl_options_set_schedule_serialize_sccs(PPCGGen->ctx, false); - isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); - isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); - isl_options_set_schedule_whole_component(PPCGGen->ctx, false); - - isl_schedule *Schedule = get_schedule(PPCGGen); - - int has_permutable = has_any_permutable_node(Schedule); - - Schedule = - isl_schedule_align_params(Schedule, S->getFullParamSpace().release()); - - if (!has_permutable || has_permutable < 0) { - Schedule = isl_schedule_free(Schedule); - LLVM_DEBUG(dbgs() << getUniqueScopName(S) - << " does not have permutable bands. Bailing out\n";); - } else { - const bool CreateTransferToFromDevice = !PollyManagedMemory; - Schedule = map_to_device(PPCGGen, Schedule, CreateTransferToFromDevice); - PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); - } - - if (DumpSchedule) { - isl_printer *P = isl_printer_to_str(S->getIslCtx().get()); - P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); - P = isl_printer_print_str(P, "Schedule\n"); - P = isl_printer_print_str(P, "========\n"); - if (Schedule) - P = isl_printer_print_schedule(P, Schedule); - else - P = isl_printer_print_str(P, "No schedule found\n"); - - outs() << isl_printer_get_str(P) << "\n"; - isl_printer_free(P); - } - - if (DumpCode) { - outs() << "Code\n"; - outs() << "====\n"; - if (PPCGGen->tree) - printGPUTree(PPCGGen->tree, PPCGProg); - else - outs() << "No code generated\n"; - } - - isl_schedule_free(Schedule); - - return PPCGGen; - } - - /// Free gpu_gen structure. - /// - /// @param PPCGGen The ppcg_gen object to free. - void freePPCGGen(gpu_gen *PPCGGen) { - isl_ast_node_free(PPCGGen->tree); - isl_union_map_free(PPCGGen->sizes); - isl_union_map_free(PPCGGen->used_sizes); - free(PPCGGen); - } - - /// Free the options in the ppcg scop structure. 
- /// - /// ppcg is not freeing these options for us. To avoid leaks we do this - /// ourselves. - /// - /// @param PPCGScop The scop referencing the options to free. - void freeOptions(ppcg_scop *PPCGScop) { - free(PPCGScop->options->debug); - PPCGScop->options->debug = nullptr; - free(PPCGScop->options); - PPCGScop->options = nullptr; - } - - /// Approximate the number of points in the set. - /// - /// This function returns an ast expression that overapproximates the number - /// of points in an isl set through the rectangular hull surrounding this set. - /// - /// @param Set The set to count. - /// @param Build The isl ast build object to use for creating the ast - /// expression. - /// - /// @returns An approximation of the number of points in the set. - __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set, - __isl_keep isl_ast_build *Build) { - - isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1); - auto *Expr = isl_ast_expr_from_val(isl_val_copy(One)); - - isl_space *Space = isl_set_get_space(Set); - Space = isl_space_params(Space); - auto *Univ = isl_set_universe(Space); - isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One); - - for (long i = 0, n = isl_set_dim(Set, isl_dim_set); i < n; i++) { - isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i); - isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i); - isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min); - DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff)); - auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize); - Expr = isl_ast_expr_mul(Expr, DimSizeExpr); - } - - isl_set_free(Set); - isl_pw_aff_free(OneAff); - - return Expr; - } - - /// Approximate a number of dynamic instructions executed by a given - /// statement. - /// - /// @param Stmt The statement for which to compute the number of dynamic - /// instructions. - /// @param Build The isl ast build object to use for creating the ast - /// expression. - /// @returns An approximation of the number of dynamic instructions executed - /// by @p Stmt. - __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt, - __isl_keep isl_ast_build *Build) { - auto Iterations = approxPointsInSet(Stmt.getDomain().release(), Build); - - long InstCount = 0; - - if (Stmt.isBlockStmt()) { - auto *BB = Stmt.getBasicBlock(); - InstCount = std::distance(BB->begin(), BB->end()); - } else { - auto *R = Stmt.getRegion(); - - for (auto *BB : R->blocks()) { - InstCount += std::distance(BB->begin(), BB->end()); - } - } - - isl_val *InstVal = isl_val_int_from_si(S->getIslCtx().get(), InstCount); - auto *InstExpr = isl_ast_expr_from_val(InstVal); - return isl_ast_expr_mul(InstExpr, Iterations); - } - - /// Approximate dynamic instructions executed in scop. - /// - /// @param S The scop for which to approximate dynamic instructions. - /// @param Build The isl ast build object to use for creating the ast - /// expression. - /// @returns An approximation of the number of dynamic instructions executed - /// in @p S. - __isl_give isl_ast_expr * - getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) { - isl_ast_expr *Instructions; - - isl_val *Zero = isl_val_int_from_si(S.getIslCtx().get(), 0); - Instructions = isl_ast_expr_from_val(Zero); - - for (ScopStmt &Stmt : S) { - isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build); - Instructions = isl_ast_expr_add(Instructions, StmtInstructions); - } - return Instructions; - } - - /// Create a check that ensures sufficient compute in scop. 
- /// - /// @param S The scop for which to ensure sufficient compute. - /// @param Build The isl ast build object to use for creating the ast - /// expression. - /// @returns An expression that evaluates to TRUE in case of sufficient - /// compute and to FALSE, otherwise. - __isl_give isl_ast_expr * - createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) { - auto Iterations = getNumberOfIterations(S, Build); - auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx().get(), MinCompute); - auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal); - return isl_ast_expr_ge(Iterations, MinComputeExpr); - } - - /// Check if the basic block contains a function we cannot codegen for GPU - /// kernels. - /// - /// If this basic block does something with a `Function` other than calling - /// a function that we support in a kernel, return true. - bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB, - bool AllowCUDALibDevice) { - for (const Instruction &Inst : *BB) { - const CallInst *Call = dyn_cast(&Inst); - if (Call && isValidFunctionInKernel(Call->getCalledFunction(), - AllowCUDALibDevice)) - continue; - - for (Value *Op : Inst.operands()) - // Look for functions among operands of Inst. - if (isa(Op->stripPointerCasts())) { - LLVM_DEBUG(dbgs() - << Inst << " has illegal use of function in kernel.\n"); - return true; - } - } - return false; - } - - /// Return whether the Scop S uses functions in a way that we do not support. - bool containsInvalidKernelFunction(const Scop &S, bool AllowCUDALibDevice) { - for (auto &Stmt : S) { - if (Stmt.isBlockStmt()) { - if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock(), - AllowCUDALibDevice)) - return true; - } else { - assert(Stmt.isRegionStmt() && - "Stmt was neither block nor region statement"); - for (const BasicBlock *BB : Stmt.getRegion()->blocks()) - if (containsInvalidKernelFunctionInBlock(BB, AllowCUDALibDevice)) - return true; - } - } - return false; - } - - /// Generate code for a given GPU AST described by @p Root. - /// - /// @param Root An isl_ast_node pointing to the root of the GPU AST. - /// @param Prog The GPU Program to generate code for. - void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { - ScopAnnotator Annotator; - Annotator.buildAliasScopes(*S); - - Region *R = &S->getRegion(); - - simplifyRegion(R, DT, LI, RI); - - BasicBlock *EnteringBB = R->getEnteringBlock(); - - PollyIRBuilder Builder(EnteringBB->getContext(), ConstantFolder(), - IRInserter(Annotator)); - Builder.SetInsertPoint(EnteringBB->getTerminator()); - - // Only build the run-time condition and parameters _after_ having - // introduced the conditional branch. This is important as the conditional - // branch will guard the original scop from new induction variables that - // the SCEVExpander may introduce while code generating the parameters and - // which may introduce scalar dependences that prevent us from correctly - // code generating this scop. 
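// The sufficient-compute heuristic above in concrete numbers: a statement
// with domain { S[i,j] : 0 <= i < 100 and 0 <= j < 50 } and 8 LLVM-IR
// instructions is approximated as 8 * (100 * 50) = 40000 dynamic
// instructions, which the generated check compares against the MinCompute
// threshold at run time. The figures are made up for the example.
#include <cstdint>

static int64_t approxDynamicInsts() {
  const int64_t DimSizes[] = {100, 50}; // (max - min + 1) per dimension.
  int64_t Iterations = 1;
  for (int64_t Size : DimSizes)
    Iterations *= Size;
  const int64_t InstCount = 8; // Instructions in the statement's block.
  return InstCount * Iterations; // 40000
}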
- BBPair StartExitBlocks; - BranchInst *CondBr = nullptr; - std::tie(StartExitBlocks, CondBr) = - executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); - BasicBlock *StartBlock = std::get<0>(StartExitBlocks); - - assert(CondBr && "CondBr not initialized by executeScopConditionally"); - - GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, - StartBlock, Prog, Runtime, Architecture); - - // TODO: Handle LICM - auto SplitBlock = StartBlock->getSinglePredecessor(); - Builder.SetInsertPoint(SplitBlock->getTerminator()); - - isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx().get()); - isl::ast_expr Condition = - IslAst::buildRunCondition(*S, isl::manage_copy(Build)); - isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build); - Condition = - isl::manage(isl_ast_expr_and(Condition.release(), SufficientCompute)); - isl_ast_build_free(Build); - - // preload invariant loads. Note: This should happen before the RTC - // because the RTC may depend on values that are invariant load hoisted. - if (!NodeBuilder.preloadInvariantLoads()) { - // Patch the introduced branch condition to ensure that we always execute - // the original SCoP. - auto *FalseI1 = Builder.getFalse(); - auto *SplitBBTerm = Builder.GetInsertBlock()->getTerminator(); - SplitBBTerm->setOperand(0, FalseI1); - - LLVM_DEBUG(dbgs() << "preloading invariant loads failed in function: " + - S->getFunction().getName() + - " | Scop Region: " + S->getNameStr()); - // adjust the dominator tree accordingly. - auto *ExitingBlock = StartBlock->getUniqueSuccessor(); - assert(ExitingBlock); - auto *MergeBlock = ExitingBlock->getUniqueSuccessor(); - assert(MergeBlock); - polly::markBlockUnreachable(*StartBlock, Builder); - polly::markBlockUnreachable(*ExitingBlock, Builder); - auto *ExitingBB = S->getExitingBlock(); - assert(ExitingBB); - - DT->changeImmediateDominator(MergeBlock, ExitingBB); - DT->eraseNode(ExitingBlock); - isl_ast_node_free(Root); - } else { - - if (polly::PerfMonitoring) { - PerfMonitor P(*S, EnteringBB->getParent()->getParent()); - P.initialize(); - P.insertRegionStart(SplitBlock->getTerminator()); - - // TODO: actually think if this is the correct exiting block to place - // the `end` performance marker. Invariant load hoisting changes - // the CFG in a way that I do not precisely understand, so I - // (Siddharth) should come back to this and - // think about which exiting block to use. - auto *ExitingBlock = StartBlock->getUniqueSuccessor(); - assert(ExitingBlock); - BasicBlock *MergeBlock = ExitingBlock->getUniqueSuccessor(); - P.insertRegionEnd(MergeBlock->getTerminator()); - } - - NodeBuilder.addParameters(S->getContext().release()); - Value *RTC = NodeBuilder.createRTC(Condition.release()); - Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC); - - Builder.SetInsertPoint(&*StartBlock->begin()); - - NodeBuilder.create(Root); - } - - /// In case a sequential kernel has more surrounding loops as any parallel - /// kernel, the SCoP is probably mostly sequential. Hence, there is no - /// point in running it on a GPU. 
- if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel) - CondBr->setOperand(0, Builder.getFalse()); - - if (!NodeBuilder.BuildSuccessful) - CondBr->setOperand(0, Builder.getFalse()); - } - - bool runOnScop(Scop &CurrentScop) override { - S = &CurrentScop; - LI = &getAnalysis().getLoopInfo(); - DT = &getAnalysis().getDomTree(); - SE = &getAnalysis().getSE(); - DL = &S->getRegion().getEntry()->getModule()->getDataLayout(); - RI = &getAnalysis().getRegionInfo(); - - LLVM_DEBUG(dbgs() << "PPCGCodeGen running on : " << getUniqueScopName(S) - << " | loop depth: " << S->getMaxLoopDepth() << "\n"); - - // We currently do not support functions other than intrinsics inside - // kernels, as code generation will need to offload function calls to the - // kernel. This may lead to a kernel trying to call a function on the host. - // This also allows us to prevent codegen from trying to take the - // address of an intrinsic function to send to the kernel. - if (containsInvalidKernelFunction(CurrentScop, - Architecture == GPUArch::NVPTX64)) { - LLVM_DEBUG( - dbgs() << getUniqueScopName(S) - << " contains function which cannot be materialised in a GPU " - "kernel. Bailing out.\n";); - return false; - } - - auto PPCGScop = createPPCGScop(); - auto PPCGProg = createPPCGProg(PPCGScop); - auto PPCGGen = generateGPU(PPCGScop, PPCGProg); - - if (PPCGGen->tree) { - generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); - CurrentScop.markAsToBeSkipped(); - } else { - LLVM_DEBUG(dbgs() << getUniqueScopName(S) - << " has empty PPCGGen->tree. Bailing out.\n"); - } - - freeOptions(PPCGScop); - freePPCGGen(PPCGGen); - gpu_prog_free(PPCGProg); - ppcg_scop_free(PPCGScop); - - return true; - } - - void printScop(raw_ostream &, Scop &) const override {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - - // FIXME: We do not yet add regions for the newly generated code to the - // region tree. 
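// Skeleton of the code-versioning guard that generateCode() above relies
// on: a conditional branch selects between the GPU-mapped version and the
// untouched original SCoP, and bail-out paths simply overwrite the branch
// condition with false. Block and value names are illustrative.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

static llvm::BranchInst *emitVersioningGuard(llvm::IRBuilder<> &Builder,
                                             llvm::BasicBlock *GPUVersion,
                                             llvm::BasicBlock *Original,
                                             llvm::Value *RTC) {
  llvm::BranchInst *CondBr = Builder.CreateCondBr(RTC, GPUVersion, Original);
  // On failure (e.g., kernel generation bailed out), force the original
  // version, exactly like the setOperand(0, getFalse()) calls above.
  CondBr->setOperand(0, Builder.getFalse());
  return CondBr;
}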
- } -}; -} // namespace - -char PPCGCodeGeneration::ID = 1; - -Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) { - PPCGCodeGeneration *generator = new PPCGCodeGeneration(); - generator->Runtime = Runtime; - generator->Architecture = Arch; - return generator; -} - -INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", - "Polly - Apply PPCG translation to SCOP", false, false) -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg", - "Polly - Apply PPCG translation to SCOP", false, false) diff --git a/polly/lib/CodeGen/RuntimeDebugBuilder.cpp b/polly/lib/CodeGen/RuntimeDebugBuilder.cpp --- a/polly/lib/CodeGen/RuntimeDebugBuilder.cpp +++ b/polly/lib/CodeGen/RuntimeDebugBuilder.cpp @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include "polly/CodeGen/RuntimeDebugBuilder.h" -#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/Module.h" #include #include @@ -17,6 +16,16 @@ using namespace llvm; using namespace polly; +llvm::Value *RuntimeDebugBuilder::getPrintableString(PollyIRBuilder &Builder, + llvm::StringRef Str) { + // FIXME: addressspace(4) is a marker for a string (for the %s conversion + // specifier) but should be using the default address space. This only works + // because CPU backends typically ignore the address space. For constant + // strings as returned by getPrintableString, the format string should instead + // directly spell out the string. 
+ return Builder.CreateGlobalStringPtr(Str, "", 4); +} + Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); const char *Name = "vprintf"; @@ -33,72 +42,9 @@ return F; } -Function *RuntimeDebugBuilder::getAddressSpaceCast(PollyIRBuilder &Builder, - unsigned Src, unsigned Dst, - unsigned SrcBits, - unsigned DstBits) { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - auto Name = std::string("llvm.nvvm.ptr.constant.to.gen.p") + - std::to_string(Dst) + "i" + std::to_string(DstBits) + ".p" + - std::to_string(Src) + "i" + std::to_string(SrcBits); - Function *F = M->getFunction(Name); - - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - FunctionType *Ty = FunctionType::get( - PointerType::get(Builder.getIntNTy(DstBits), Dst), - PointerType::get(Builder.getIntNTy(SrcBits), Src), false); - F = Function::Create(Ty, Linkage, Name, M); - } - - return F; -} - -std::vector -RuntimeDebugBuilder::getGPUThreadIdentifiers(PollyIRBuilder &Builder) { - std::vector Identifiers; - - auto M = Builder.GetInsertBlock()->getParent()->getParent(); - - std::vector BlockIDs = { - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_x), - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_y), - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_z), - }; - - Identifiers.push_back(Builder.CreateGlobalStringPtr("> block-id: ", "", 4)); - for (auto GetID : BlockIDs) { - Value *Id = Builder.CreateCall(GetID, {}); - Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false); - Identifiers.push_back(Id); - Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4)); - } - - Identifiers.push_back(Builder.CreateGlobalStringPtr("| ", "", 4)); - - std::vector ThreadIDs = { - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_x), - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_y), - Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_z), - }; - - Identifiers.push_back(Builder.CreateGlobalStringPtr("thread-id: ", "", 4)); - for (auto GetId : ThreadIDs) { - Value *Id = Builder.CreateCall(GetId, {}); - Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false); - Identifiers.push_back(Id); - Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4)); - } - - return Identifiers; -} - -void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, bool IsGPU, +void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, ArrayRef Values) { - if (IsGPU) - createGPUPrinterT(Builder, Values); - else - createCPUPrinterT(Builder, Values); + createCPUPrinterT(Builder, Values); } bool RuntimeDebugBuilder::isPrintable(Type *Ty) { @@ -169,78 +115,6 @@ createFlush(Builder); } -void RuntimeDebugBuilder::createGPUPrinterT(PollyIRBuilder &Builder, - ArrayRef Values) { - std::string str; - - auto *Zero = Builder.getInt64(0); - - auto ToPrint = getGPUThreadIdentifiers(Builder); - - ToPrint.push_back(Builder.CreateGlobalStringPtr("\n ", "", 4)); - ToPrint.insert(ToPrint.end(), Values.begin(), Values.end()); - - const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); - - // Allocate print buffer (assuming 2*32 bit per element) - auto T = ArrayType::get(Builder.getInt32Ty(), ToPrint.size() * 2); - Value *Data = new AllocaInst( - T, DL.getAllocaAddrSpace(), "polly.vprint.buffer", - &Builder.GetInsertBlock()->getParent()->getEntryBlock().front()); - auto *DataPtr = Builder.CreateGEP(T, Data, {Zero, Zero}); - - 
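// Reduced sketch of the CUDA vprintf convention that the removed
// createGPUPrinterT() below implemented: all arguments are stored into one
// buffer, and vprintf receives the format string plus a pointer to that
// buffer. This single-argument version assumes "VPrintF" is the vprintf
// declaration obtained via getVPrintF() and that the argument is an i64.
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"

static void emitVPrintfCall(llvm::IRBuilder<> &Builder, llvm::Function *VPrintF,
                            llvm::Value *Format, llvm::Value *Int64Arg) {
  // One 8-byte slot; the removed code sized the buffer for all arguments.
  llvm::AllocaInst *Buf = Builder.CreateAlloca(Builder.getInt64Ty());
  Builder.CreateStore(Int64Arg, Buf);
  llvm::Value *Data = Builder.CreatePointerCast(Buf, Builder.getInt8PtrTy());
  Builder.CreateCall(VPrintF, {Format, Data});
}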
int Offset = 0; - for (auto Val : ToPrint) { - auto Ptr = Builder.CreateGEP(Builder.getInt32Ty(), DataPtr, - Builder.getInt64(Offset)); - Type *Ty = Val->getType(); - - if (Ty->isFloatingPointTy()) { - if (!Ty->isDoubleTy()) - Val = Builder.CreateFPExt(Val, Builder.getDoubleTy()); - } else if (Ty->isIntegerTy()) { - if (Ty->getIntegerBitWidth() < 64) { - Val = Builder.CreateSExt(Val, Builder.getInt64Ty()); - } else { - assert(Ty->getIntegerBitWidth() == 64 && - "Integer types larger 64 bit not supported"); - // fallthrough - } - } else if (isa(Ty)) { - if (Ty == Builder.getInt8PtrTy(4)) { - // Pointers in constant address space are printed as strings - Val = Builder.CreateGEP(Builder.getInt8Ty(), Val, Builder.getInt64(0)); - auto F = RuntimeDebugBuilder::getAddressSpaceCast(Builder, 4, 0); - Val = Builder.CreateCall(F, Val); - } else { - Val = Builder.CreatePtrToInt(Val, Builder.getInt64Ty()); - } - } else { - llvm_unreachable("Unknown type"); - } - - Ty = Val->getType(); - Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Ty->getPointerTo(5)); - Builder.CreateAlignedStore(Val, Ptr, Align(4)); - - if (Ty->isFloatingPointTy()) - str += "%f"; - else if (Ty->isIntegerTy()) - str += "%ld"; - else - str += "%s"; - - Offset += 2; - } - - Value *Format = Builder.CreateGlobalStringPtr(str, "polly.vprintf.buffer", 4); - Format = Builder.CreateCall(getAddressSpaceCast(Builder, 4, 0), Format); - - Data = Builder.CreateBitCast(Data, Builder.getInt8PtrTy()); - - Builder.CreateCall(getVPrintF(Builder), {Format, Data}); -} - Function *RuntimeDebugBuilder::getPrintF(PollyIRBuilder &Builder) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); const char *Name = "printf"; diff --git a/polly/lib/External/CMakeLists.txt b/polly/lib/External/CMakeLists.txt --- a/polly/lib/External/CMakeLists.txt +++ b/polly/lib/External/CMakeLists.txt @@ -314,91 +314,3 @@ target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS}) target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS}) endif (POLLY_BUNDLED_ISL) - - -# External: Polyhedral Parallel Code Generator -if (GPU_CODEGEN) - set(PET_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pet") - set(PPCG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ppcg") - set(PPCG_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/ppcg") - - # Determine version of ppcg - if (EXISTS "${PPCG_SOURCE_DIR}/GIT_HEAD_ID") - # The source comes from a 'make dist' archive - file(READ "${PPCG_SOURCE_DIR}/GIT_HEAD_ID" PPCG_GIT_HEAD_ID) - string(STRIP "${PPCG_GIT_HEAD_ID}" PPCG_GIT_HEAD_ID) - elseif (EXISTS "${PPCG_SOURCE_DIR}/gitversion.h") - # The source directory is preconfigured - file(READ "${PPCG_SOURCE_DIR}/gitversion.h" GITVERSION_H) - string(REGEX REPLACE ".*\\\"([^\\\"]*)\\\".*" "\\1" PPCG_GIT_HEAD_ID "${GITVERSION_H}") - elseif () - # Unknown revision - # TODO: We could look for a .git and get the revision from HEAD - set(PPCG_GIT_HEAD_ID "UNKNOWN") - endif () - - message(STATUS "PPCG version: ${PPCG_GIT_HEAD_ID}") - - set (PPCG_FILES - ppcg/cuda.c - ppcg/cuda_common.c - ppcg/external.c - ppcg/gpu_array_tile.c - ppcg/gpu.c - ppcg/gpu_array_tile.c - ppcg/gpu_group.c - ppcg/gpu_hybrid.c - ppcg/gpu_print.c - ppcg/gpu_tree.c - ppcg/grouping.c - ppcg/hybrid.c - ppcg/ppcg.c - ppcg/ppcg_options.c - ppcg/print.c - ppcg/schedule.c - ppcg/util.c - ) - - include_directories(BEFORE - ${PPCG_BINARY_DIR} - ${PPCG_SOURCE_DIR}/imath - ${PPCG_SOURCE_DIR}/include - ${PET_SOURCE_DIR}/include - ) - - add_polly_library(PollyPPCG - ${PPCG_FILES} - ) - - target_link_libraries(PollyPPCG PUBLIC 
${ISL_TARGET}) - - # Disable warnings for upstream projects. - if (MSVC) - set(DISABLE_WARNING_FLAGS - -wd4018 # 'expression' : signed/unsigned mismatch - -wd4090 # 'operation' : different 'modifier' qualifiers - -wd4200 # nonstandard extension used: zero-sized array in struct/union - -wd4201 # nonstandard extension used: nameless struct/union - -wd4334 # 'operator': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?) - -wd4221 # nonstandard extension used : 'identifier' : cannot be initialized using address of automatic variable - ) - if (POLLY_BUNDLED_ISL) - target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS}) - target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS}) - endif (POLLY_BUNDLED_ISL) - target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS}) - else () - if (POLLY_BUNDLED_ISL) - set_target_properties(PollyISL polly-isl-test PROPERTIES COMPILE_FLAGS "-w") - endif (POLLY_BUNDLED_ISL) - set_target_properties(PollyPPCG PROPERTIES COMPILE_FLAGS "-w") - endif () - - if(MSVC) - # In the Windows API (with some exceptions), the maximum length for a path is - # MAX_PATH, which is defined as 260 characters. - target_compile_definitions(PollyPPCG PRIVATE "-DPATH_MAX=260") - endif () - - target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS}) -endif () diff --git a/polly/lib/External/pet/include/pet.h b/polly/lib/External/pet/include/pet.h deleted file mode 100644 --- a/polly/lib/External/pet/include/pet.h +++ /dev/null @@ -1,622 +0,0 @@ -#ifndef PET_H -#define PET_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__cplusplus) -extern "C" { -#endif - -struct pet_options; -ISL_ARG_DECL(pet_options, struct pet_options, pet_options_args) - -/* Create an isl_ctx that references the pet options. */ -isl_ctx *isl_ctx_alloc_with_pet_options(); - -/* If autodetect is set, any valid scop is extracted. - * Otherwise, the scop needs to be delimited by pragmas. - */ -int pet_options_set_autodetect(isl_ctx *ctx, int val); -int pet_options_get_autodetect(isl_ctx *ctx); - -int pet_options_set_detect_conditional_assignment(isl_ctx *ctx, int val); -int pet_options_get_detect_conditional_assignment(isl_ctx *ctx); - -/* If encapsulate-dynamic-control is set, then any dynamic control - * in the input program will be encapsulated in macro statements. - * This means in particular that no statements with arguments - * will be created. - */ -int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val); -int pet_options_get_encapsulate_dynamic_control(isl_ctx *ctx); - -#define PET_OVERFLOW_AVOID 0 -#define PET_OVERFLOW_IGNORE 1 -int pet_options_set_signed_overflow(isl_ctx *ctx, int val); -int pet_options_get_signed_overflow(isl_ctx *ctx); - -struct pet_loc; -typedef struct pet_loc pet_loc; - -/* Return an additional reference to "loc". */ -__isl_give pet_loc *pet_loc_copy(__isl_keep pet_loc *loc); -/* Free a reference to "loc". */ -pet_loc *pet_loc_free(__isl_take pet_loc *loc); - -/* Return the offset in the input file of the start of "loc". */ -unsigned pet_loc_get_start(__isl_keep pet_loc *loc); -/* Return the offset in the input file of the character after "loc". */ -unsigned pet_loc_get_end(__isl_keep pet_loc *loc); -/* Return the line number of a line within the "loc" region. */ -int pet_loc_get_line(__isl_keep pet_loc *loc); -/* Return the indentation of the "loc" region. 
*/ -__isl_keep const char *pet_loc_get_indent(__isl_keep pet_loc *loc); - -enum pet_expr_type { - pet_expr_error = -1, - pet_expr_access, - pet_expr_call, - pet_expr_cast, - pet_expr_int, - pet_expr_double, - pet_expr_op -}; - -enum pet_op_type { - /* only compound assignments operators before assignment */ - pet_op_add_assign, - pet_op_sub_assign, - pet_op_mul_assign, - pet_op_div_assign, - pet_op_and_assign, - pet_op_xor_assign, - pet_op_or_assign, - pet_op_assign, - pet_op_add, - pet_op_sub, - pet_op_mul, - pet_op_div, - pet_op_mod, - pet_op_shl, - pet_op_shr, - pet_op_eq, - pet_op_ne, - pet_op_le, - pet_op_ge, - pet_op_lt, - pet_op_gt, - pet_op_minus, - pet_op_post_inc, - pet_op_post_dec, - pet_op_pre_inc, - pet_op_pre_dec, - pet_op_address_of, - pet_op_assume, - pet_op_kill, - pet_op_and, - pet_op_xor, - pet_op_or, - pet_op_not, - pet_op_land, - pet_op_lor, - pet_op_lnot, - pet_op_cond, - pet_op_last -}; - -/* Index into the pet_expr->args array when pet_expr->type == pet_expr_unary - */ -enum pet_un_arg_type { - pet_un_arg -}; - -/* Indices into the pet_expr->args array when - * pet_expr->type == pet_expr_binary - */ -enum pet_bin_arg_type { - pet_bin_lhs, - pet_bin_rhs -}; - -/* Indices into the pet_expr->args array when - * pet_expr->type == pet_expr_ternary - */ -enum pet_ter_arg_type { - pet_ter_cond, - pet_ter_true, - pet_ter_false -}; - -struct pet_expr; -typedef struct pet_expr pet_expr; - -/* Return an additional reference to "expr". */ -__isl_give pet_expr *pet_expr_copy(__isl_keep pet_expr *expr); -/* Free a reference to "expr". */ -__isl_null pet_expr *pet_expr_free(__isl_take pet_expr *expr); - -/* Return the isl_ctx in which "expr" was created. */ -isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr); - -/* Return the type of "expr". */ -enum pet_expr_type pet_expr_get_type(__isl_keep pet_expr *expr); -/* Return the number of arguments of "expr". */ -int pet_expr_get_n_arg(__isl_keep pet_expr *expr); -/* Set the number of arguments of "expr" to "n". */ -__isl_give pet_expr *pet_expr_set_n_arg(__isl_take pet_expr *expr, int n); -/* Return the argument of "expr" at position "pos". */ -__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos); -/* Replace the argument of "expr" at position "pos" by "arg". */ -__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos, - __isl_take pet_expr *arg); - -/* Return the operation type of operation expression "expr". */ -enum pet_op_type pet_expr_op_get_type(__isl_keep pet_expr *expr); -/* Replace the operation type of operation expression "expr" by "type". */ -__isl_give pet_expr *pet_expr_op_set_type(__isl_take pet_expr *expr, - enum pet_op_type type); - -/* Construct a (read) access pet_expr from an index expression. */ -__isl_give pet_expr *pet_expr_from_index(__isl_take isl_multi_pw_aff *index); - -/* Does "expr" represent an affine expression? */ -isl_bool pet_expr_is_affine(__isl_keep pet_expr *expr); -/* Does the access expression "expr" read the accessed elements? */ -isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr); -/* Does the access expression "expr" write to the accessed elements? */ -isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr); -/* Does the access expression "expr" kill the accessed elements? */ -isl_bool pet_expr_access_is_kill(__isl_keep pet_expr *expr); -/* Mark "expr" as a read depending on "read". */ -__isl_give pet_expr *pet_expr_access_set_read(__isl_take pet_expr *expr, - int read); -/* Mark "expr" as a write depending on "write". 
*/ -__isl_give pet_expr *pet_expr_access_set_write(__isl_take pet_expr *expr, - int write); -/* Mark "expr" as a kill depending on "kill". */ -__isl_give pet_expr *pet_expr_access_set_kill(__isl_take pet_expr *expr, - int kill); -/* Return the reference identifier of access expression "expr". */ -__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr); -/* Replace the reference identifier of access expression "expr" by "ref_id". */ -__isl_give pet_expr *pet_expr_access_set_ref_id(__isl_take pet_expr *expr, - __isl_take isl_id *ref_id); -/* Return the identifier of the outer array accessed by "expr". */ -__isl_give isl_id *pet_expr_access_get_id(__isl_keep pet_expr *expr); -/* Return the index expression of access expression "expr". */ -__isl_give isl_multi_pw_aff *pet_expr_access_get_index( - __isl_keep pet_expr *expr); - -/* Return the potential read access relation of access expression "expr". */ -__isl_give isl_union_map *pet_expr_access_get_may_read( - __isl_keep pet_expr *expr); -/* Return the potential write access relation of access expression "expr". */ -__isl_give isl_union_map *pet_expr_access_get_may_write( - __isl_keep pet_expr *expr); -/* Return the definite write access relation of access expression "expr". */ -__isl_give isl_union_map *pet_expr_access_get_must_write( - __isl_keep pet_expr *expr); -/* Return the argument dependent potential read access relation of "expr". */ -__isl_give isl_union_map *pet_expr_access_get_dependent_may_read( - __isl_keep pet_expr *expr); -/* Return the argument dependent potential write access relation of "expr". */ -__isl_give isl_union_map *pet_expr_access_get_dependent_may_write( - __isl_keep pet_expr *expr); -/* Return the argument dependent definite write access relation of "expr". */ -__isl_give isl_union_map *pet_expr_access_get_dependent_must_write( - __isl_keep pet_expr *expr); -/* Return the tagged potential read access relation of access "expr". */ -__isl_give isl_union_map *pet_expr_access_get_tagged_may_read( - __isl_keep pet_expr *expr); -/* Return the tagged potential write access relation of access "expr". */ -__isl_give isl_union_map *pet_expr_access_get_tagged_may_write( - __isl_keep pet_expr *expr); - -/* Return the name of the function called by "expr". */ -__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr); -/* Replace the name of the function called by "expr" by "name". */ -__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr, - __isl_keep const char *name); - -/* Create a pet_expr representing a cast of "arg" to "type_name". */ -__isl_give pet_expr *pet_expr_new_cast(const char *type_name, - __isl_take pet_expr *arg); -/* Replace the type of the cast performed by "expr" by "name". */ -__isl_give pet_expr *pet_expr_cast_set_type_name(__isl_take pet_expr *expr, - __isl_keep const char *name); - -/* Return the value of the integer represented by "expr". */ -__isl_give isl_val *pet_expr_int_get_val(__isl_keep pet_expr *expr); -/* Replace the value of the integer represented by "expr" by "v". */ -__isl_give pet_expr *pet_expr_int_set_val(__isl_take pet_expr *expr, - __isl_take isl_val *v); - -/* Return a string representation of the double expression "expr". 
*/ -__isl_give char *pet_expr_double_get_str(__isl_keep pet_expr *expr); -/* Replace value and string representation of the double expression "expr" */ -__isl_give pet_expr *pet_expr_double_set(__isl_take pet_expr *expr, - double d, __isl_keep const char *s); - -/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_access. */ -int pet_expr_foreach_access_expr(__isl_keep pet_expr *expr, - int (*fn)(__isl_keep pet_expr *expr, void *user), void *user); -/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_call. */ -int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr, - int (*fn)(__isl_keep pet_expr *expr, void *user), void *user); - -struct pet_context; -typedef struct pet_context pet_context; - -/* Create a context with the given domain. */ -__isl_give pet_context *pet_context_alloc(__isl_take isl_set *domain); -/* Return an additional reference to "pc". */ -__isl_give pet_context *pet_context_copy(__isl_keep pet_context *pc); -/* Free a reference to "pc". */ -__isl_null pet_context *pet_context_free(__isl_take pet_context *pc); - -/* Return the isl_ctx in which "pc" was created. */ -isl_ctx *pet_context_get_ctx(__isl_keep pet_context *pc); - -/* Extract an affine expression defined over the domain of "pc" from "expr" - * or return NaN. - */ -__isl_give isl_pw_aff *pet_expr_extract_affine(__isl_keep pet_expr *expr, - __isl_keep pet_context *pc); - -void pet_expr_dump(__isl_keep pet_expr *expr); - -enum pet_tree_type { - pet_tree_error = -1, - pet_tree_expr, - pet_tree_block, - pet_tree_break, - pet_tree_continue, - pet_tree_decl, /* A declaration without initialization */ - pet_tree_decl_init, /* A declaration with initialization */ - pet_tree_if, /* An if without an else branch */ - pet_tree_if_else, /* An if with an else branch */ - pet_tree_for, - pet_tree_infinite_loop, - pet_tree_while, - pet_tree_return, -}; - -struct pet_tree; -typedef struct pet_tree pet_tree; - -/* Return the isl_ctx in which "tree" was created. */ -isl_ctx *pet_tree_get_ctx(__isl_keep pet_tree *tree); - -/* Return an additional reference to "tree". */ -__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree); -/* Free a reference to "tree". */ -__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree); - -/* Return the location of "tree". */ -__isl_give pet_loc *pet_tree_get_loc(__isl_keep pet_tree *tree); - -/* Return the type of "tree". */ -enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree); - -/* Return the expression of the expression tree "tree". */ -__isl_give pet_expr *pet_tree_expr_get_expr(__isl_keep pet_tree *tree); - -/* Return the expression returned by the return tree "tree". */ -__isl_give pet_expr *pet_tree_return_get_expr(__isl_keep pet_tree *tree); - -/* Return the number of children of the block tree "tree". */ -int pet_tree_block_n_child(__isl_keep pet_tree *tree); -/* Return child "pos" of the block tree "tree". */ -__isl_give pet_tree *pet_tree_block_get_child(__isl_keep pet_tree *tree, - int pos); - -/* Is "tree" a declaration (with or without initialization)? */ -int pet_tree_is_decl(__isl_keep pet_tree *tree); -/* Return the variable declared by the declaration tree "tree". */ -__isl_give pet_expr *pet_tree_decl_get_var(__isl_keep pet_tree *tree); -/* Return the initial value of the pet_tree_decl_init tree "tree". */ -__isl_give pet_expr *pet_tree_decl_get_init(__isl_keep pet_tree *tree); - -/* Return the condition of the if tree "tree". 
- */
-__isl_give pet_expr *pet_tree_if_get_cond(__isl_keep pet_tree *tree);
-/* Return the then branch of the if tree "tree". */
-__isl_give pet_tree *pet_tree_if_get_then(__isl_keep pet_tree *tree);
-/* Return the else branch of the if tree with else branch "tree". */
-__isl_give pet_tree *pet_tree_if_get_else(__isl_keep pet_tree *tree);
-
-/* Is "tree" a for loop, a while loop or an infinite loop? */
-int pet_tree_is_loop(__isl_keep pet_tree *tree);
-/* Return the induction variable of the for loop "tree" */
-__isl_give pet_expr *pet_tree_loop_get_var(__isl_keep pet_tree *tree);
-/* Return the initial value of the induction variable of the for loop "tree" */
-__isl_give pet_expr *pet_tree_loop_get_init(__isl_keep pet_tree *tree);
-/* Return the condition of the loop tree "tree" */
-__isl_give pet_expr *pet_tree_loop_get_cond(__isl_keep pet_tree *tree);
-/* Return the increment of the for loop "tree" */
-__isl_give pet_expr *pet_tree_loop_get_inc(__isl_keep pet_tree *tree);
-/* Return the body of the loop tree "tree" */
-__isl_give pet_tree *pet_tree_loop_get_body(__isl_keep pet_tree *tree);
-
-/* Call "fn" on each top-level expression in the nodes of "tree" */
-int pet_tree_foreach_expr(__isl_keep pet_tree *tree,
-	int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
-/* Call "fn" on each access subexpression in the nodes of "tree" */
-int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree,
-	int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
-/* Modify all call subexpressions in the nodes of "tree" through "fn". */
-__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree,
-	__isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user),
-	void *user);
-
-void pet_tree_dump(__isl_keep pet_tree *tree);
-
-/* "loc" represents the region of the source code that is represented
- * by this statement.
- *
- * If the statement has arguments, i.e., n_arg != 0, then
- * "domain" is a wrapped map, mapping the iteration domain
- * to the values of the arguments for which this statement
- * is executed.
- * Otherwise, it is simply the iteration domain.
- *
- * If one of the arguments is an access expression that accesses
- * more than one element for a given iteration, then the constraints
- * on the value of this argument (encoded in "domain") should be satisfied
- * for all of those accessed elements.
- */
-struct pet_stmt {
-	pet_loc *loc;
-	isl_set *domain;
-	pet_tree *body;
-
-	unsigned n_arg;
-	pet_expr **args;
-};
-
-/* Return the iteration space of "stmt". */
-__isl_give isl_space *pet_stmt_get_space(struct pet_stmt *stmt);
-
-/* Is "stmt" an assignment statement? */
-int pet_stmt_is_assign(struct pet_stmt *stmt);
-/* Is "stmt" a kill statement? */
-int pet_stmt_is_kill(struct pet_stmt *stmt);
-
-/* pet_stmt_build_ast_exprs is currently limited to only handle
- * some forms of data dependent accesses.
- * If pet_stmt_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs
- * can safely be called on "stmt".
- */
-int pet_stmt_can_build_ast_exprs(struct pet_stmt *stmt);
-/* Construct an associative array from reference identifiers of
- * access expressions in "stmt" to the corresponding isl_ast_expr.
- * Each index expression is first transformed through "fn_index"
- * (if not NULL). Then an AST expression is generated using "build".
- * Finally, the AST expression is transformed using "fn_expr"
- * (if not NULL).
- */
-__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt,
-	__isl_keep isl_ast_build *build,
-	__isl_give isl_multi_pw_aff *(*fn_index)(
-		__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id,
-		void *user), void *user_index,
-	__isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr,
-		__isl_keep isl_id *id, void *user), void *user_expr);
-
-/* Print "stmt" to "p".
- *
- * The access expressions in "stmt" are replaced by the isl_ast_expr
- * associated to its reference identifier in "ref2expr".
- */
-__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt,
-	__isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr);
-
-/* This structure represents a defined type.
- * "name" is the name of the type, while "definition" is a string
- * representation of its definition.
- */
-struct pet_type {
-	char *name;
-	char *definition;
-};
-
-/* context holds constraints on the parameters that ensure that
- * this array has a valid (i.e., non-negative) size
- *
- * extent holds constraints on the indices
- *
- * value_bounds holds constraints on the elements of the array
- * and may be NULL if no such constraints were specified by the user
- *
- * element_size is the size in bytes of each array element
- * element_type is the type of the array elements.
- * element_is_record is set if this type is a record type.
- *
- * live_out is set if the array appears in a live-out pragma
- *
- * if uniquely_defined is set then the array is written by a single access
- * such that any element that is ever read
- * is known to be assigned exactly once before the read
- *
- * declared is set if the array was declared somewhere inside the scop.
- * exposed is set if the declared array is visible outside the scop.
- * outer is set if the type of the array elements is a record and
- * the fields of this record are represented by separate pet_array structures.
- */
-struct pet_array {
-	isl_set *context;
-	isl_set *extent;
-	isl_set *value_bounds;
-	char *element_type;
-	int element_is_record;
-	int element_size;
-	int live_out;
-	int uniquely_defined;
-	int declared;
-	int exposed;
-	int outer;
-};
-
-/* This structure represents an implication on a boolean filter.
- * In particular, if the filter value of an element in the domain
- * of "extension" is equal to "satisfied", then the filter values
- * of the corresponding images in "extension" are also equal
- * to "satisfied".
- */
-struct pet_implication {
-	int satisfied;
-	isl_map *extension;
-};
-
-/* This structure represents an independence implied by a for loop
- * that is marked as independent in the source code.
- * "filter" contains pairs of statement instances that are guaranteed
- * not to be dependent on each other based on the independent for loop,
- * assuming that no dependences carried by this loop are implied
- * by the variables in "local".
- * "local" contains the variables that are local to the loop that was
- * marked independent.
- */
-struct pet_independence {
-	isl_union_map *filter;
-	isl_union_set *local;
-};
-
-/* "loc" represents the region of the source code that is represented
- * by this scop.
- * If the scop was detected based on scop and endscop pragmas, then
- * the lines containing these pragmas are included in this region.
- * In the final result, the context describes the set of parameter values
- * for which the scop can be executed.
- * During the construction of the pet_scop, the context lives in a set space
- * where each dimension refers to an outer loop.
- * context_value describes assignments to the parameters (if any)
- * outside of the scop.
- *
- * "schedule" is the schedule of the statements in the scop.
- *
- * The n_type types define types that may be referenced by the arrays.
- *
- * The n_implication implications describe implications on boolean filters.
- *
- * The n_independence independences describe independences implied
- * by for loops that are marked independent in the source code.
- */
-struct pet_scop {
-	pet_loc *loc;
-
-	isl_set *context;
-	isl_set *context_value;
-	isl_schedule *schedule;
-
-	int n_type;
-	struct pet_type **types;
-
-	int n_array;
-	struct pet_array **arrays;
-
-	int n_stmt;
-	struct pet_stmt **stmts;
-
-	int n_implication;
-	struct pet_implication **implications;
-
-	int n_independence;
-	struct pet_independence **independences;
-};
-typedef struct pet_scop pet_scop;
-
-/* Return a textual representation of the operator. */
-const char *pet_op_str(enum pet_op_type op);
-int pet_op_is_inc_dec(enum pet_op_type op);
-
-/* Extract a pet_scop from a C source file.
- * If function is not NULL, then the pet_scop is extracted from
- * a function with that name.
- */
-__isl_give pet_scop *pet_scop_extract_from_C_source(isl_ctx *ctx,
-	const char *filename, const char *function);
-
-/* Transform the C source file "input" by rewriting each scop.
- * When autodetecting scops, at most one scop per function is rewritten.
- * The transformed C code is written to "output".
- */
-int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
-	__isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
-		__isl_take pet_scop *scop, void *user), void *user);
-/* Given a scop and a printer passed to a pet_transform_C_source callback,
- * print the original corresponding code to the printer.
- */
-__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
-	__isl_take isl_printer *p);
-
-/* Update all isl_sets and isl_maps such that they all have the same
- * parameters in the same order.
- */
-__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop);
-
-/* Does "scop" contain any data dependent accesses? */
-int pet_scop_has_data_dependent_accesses(__isl_keep pet_scop *scop);
-/* Does "scop" contain any data dependent conditions? */
-int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop);
-/* pet_stmt_build_ast_exprs is currently limited to only handle
- * some forms of data dependent accesses.
- * If pet_scop_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs
- * can safely be called on all statements in the scop.
- */
-int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop);
-
-void pet_scop_dump(__isl_keep pet_scop *scop);
-__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop);
-
-/* Return the context of "scop". */
-__isl_give isl_set *pet_scop_get_context(__isl_keep pet_scop *scop);
-/* Return the schedule of "scop". */
-__isl_give isl_schedule *pet_scop_get_schedule(__isl_keep pet_scop *scop);
-/* Return the set of all statement instances. */
-__isl_give isl_union_set *pet_scop_get_instance_set(__isl_keep pet_scop *scop);
-/* Return the potential read access relation. */
-__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop);
-/* Return the tagged potential read access relation. */
-__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
-	__isl_keep pet_scop *scop);
-/* Return the potential write access relation.
*/ -__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop); -/* Return the definite write access relation. */ -__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop); -/* Return the tagged potential write access relation. */ -__isl_give isl_union_map *pet_scop_get_tagged_may_writes( - __isl_keep pet_scop *scop); -/* Return the tagged definite write access relation. */ -__isl_give isl_union_map *pet_scop_get_tagged_must_writes( - __isl_keep pet_scop *scop); -/* Return the definite kill access relation. */ -__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop); -/* Return the tagged definite kill access relation. */ -__isl_give isl_union_map *pet_scop_get_tagged_must_kills( - __isl_keep pet_scop *scop); - -/* Compute a mapping from all outermost arrays (of structs) in scop - * to their innermost members. - */ -__isl_give isl_union_map *pet_scop_compute_outer_to_inner( - __isl_keep pet_scop *scop); -/* Compute a mapping from all outermost arrays (of structs) in scop - * to their members, including the outermost arrays themselves. - */ -__isl_give isl_union_map *pet_scop_compute_outer_to_any( - __isl_keep pet_scop *scop); - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/polly/lib/External/ppcg/ChangeLog b/polly/lib/External/ppcg/ChangeLog deleted file mode 100644 --- a/polly/lib/External/ppcg/ChangeLog +++ /dev/null @@ -1,29 +0,0 @@ -version: 0.07 -date: Tue Feb 7 17:23:22 CET 2017 -changes: - - support hybrid tiling ---- -version: 0.06 -date: Fri May 6 12:08:50 CEST 2016 -changes: - - use PPCG specific macro names in generated code - - complete transition to schedule trees - - maximize coincidence by default - - map arrays with constant index expressions to private memory - - optionally group chains of statements ---- -version: 0.05 -date: Fri Jan 15 09:30:23 CET 2016 -changes: - - fix live-out computation - - optionally compute schedule for C target - - optionally perform tiling for C target - - create single kernel for non-permutable subtree ---- -version: 0.04 -date: Wed Jun 17 10:52:58 CEST 2015 -changes: - - use schedule trees - - fix live-range reordering - - improve generation of synchronization - - exploit independences during dependence analysis diff --git a/polly/lib/External/ppcg/GIT_HEAD_ID b/polly/lib/External/ppcg/GIT_HEAD_ID deleted file mode 100644 --- a/polly/lib/External/ppcg/GIT_HEAD_ID +++ /dev/null @@ -1 +0,0 @@ -ppcg-0.07 diff --git a/polly/lib/External/ppcg/README b/polly/lib/External/ppcg/README deleted file mode 100644 --- a/polly/lib/External/ppcg/README +++ /dev/null @@ -1,246 +0,0 @@ -Requirements: - -- automake, autoconf, libtool - (not needed when compiling a release) -- pkg-config (http://www.freedesktop.org/wiki/Software/pkg-config) - (not needed when compiling a release using the included isl and pet) -- gmp (http://gmplib.org/) -- libyaml (http://pyyaml.org/wiki/LibYAML) - (only needed if you want to compile the pet executable) -- LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html) - Unless you have some other reasons for wanting to use the svn version, - it is best to install the latest release (3.9). - For more details, see pet/README. - -If you are installing on Ubuntu, then you can install the following packages: - -automake autoconf libtool pkg-config libgmp3-dev libyaml-dev libclang-dev llvm - -Note that you need at least version 3.2 of libclang-dev (ubuntu raring). -Older versions of this package did not include the required libraries. 
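-
-On an apt-based Ubuntu system, the packages listed above can typically
-be installed with, for example,
-
-	sudo apt-get install automake autoconf libtool pkg-config \
-		libgmp3-dev libyaml-dev libclang-dev llvm
-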
-If you are using an older version of Ubuntu, then you need to compile and
-install LLVM/clang from source.
-
-
-Preparing:
-
-Grab the latest release and extract it or get the source from
-the git repository as follows. This process requires autoconf,
-automake, libtool and pkg-config.
-
-	git clone git://repo.or.cz/ppcg.git
-	cd ppcg
-	./get_submodules.sh
-	./autogen.sh
-
-
-Compilation:
-
-	./configure
-	make
-	make check
-
-If you have installed any of the required libraries in a non-standard
-location, then you may need to use the --with-gmp-prefix,
---with-libyaml-prefix and/or --with-clang-prefix options
-when calling "./configure".
-
-
-Using PPCG to generate CUDA or OpenCL code
-
-To convert a fragment of a C program to CUDA, insert a line containing
-
-	#pragma scop
-
-before the fragment and add a line containing
-
-	#pragma endscop
-
-after the fragment. To generate CUDA code run
-
-	ppcg --target=cuda file.c
-
-where file.c is the file containing the fragment. The generated
-code is stored in file_host.cu and file_kernel.cu.
-
-To generate OpenCL code run
-
-	ppcg --target=opencl file.c
-
-where file.c is the file containing the fragment. The generated code
-is stored in file_host.c and file_kernel.cl.
-
-
-Specifying tile, grid and block sizes
-
-The iteration space tile size, grid size and block size can
-be specified using the --sizes option. The argument is a union map
-in isl notation mapping kernels identified by their sequence number
-in a "kernel" space to singleton sets in the "tile", "grid" and "block"
-spaces. The sizes are specified outermost to innermost.
-
-The dimension of the "tile" space indicates the (maximal) number of loop
-dimensions to tile. The elements of the single integer tuple
-specify the tile sizes in each dimension.
-In case of hybrid tiling, the first element is half the size of
-the tile in the time (sequential) dimension. The second element
-specifies the number of elements in the base of the hexagon.
-The remaining elements specify the tile sizes in the remaining space
-dimensions.
-
-The dimension of the "grid" space indicates the (maximal) number of block
-dimensions in the grid. The elements of the single integer tuple
-specify the number of blocks in each dimension.
-
-The dimension of the "block" space indicates the (maximal) number of thread
-dimensions in the block. The elements of the single integer tuple
-specify the number of threads in each dimension.
-
-For example,
-
-	{ kernel[0] -> tile[64,64]; kernel[i] -> block[16] : i != 4 }
-
-specifies that in kernel 0, two loops should be tiled with a tile
-size of 64 in both dimensions and that all kernels except kernel 4
-should be run using a block of 16 threads.
-
-Since PPCG performs some scheduling, it can be difficult to predict
-what exactly will end up in a kernel. If you want to specify
-tile, grid or block sizes, you may want to run PPCG first with the defaults,
-examine the kernels and then run PPCG again with the desired sizes.
-Instead of examining the kernels, you can also specify the option
---dump-sizes on the first run to obtain the effectively used default sizes.
-
-
-Compiling the generated CUDA code with nvcc
-
-To get optimal performance from nvcc, it is important to choose --arch
-according to your target GPU. Specifically, use the flag "--arch sm_20"
-for Fermi, "--arch sm_30" for GK10x Kepler and "--arch sm_35" for
-GK110 Kepler. We discourage the use of older cards as we have seen
-correctness issues with compilation for older architectures.
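-
-For example, for the file.c example above and a GK110 Kepler card, the
-generated files could be compiled along the lines of
-
-	nvcc --arch sm_35 file_host.cu file_kernel.cu -o file
-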
-Note that in the absence of any --arch flag, nvcc defaults to
-"--arch sm_13". This will not only be slower, but can also cause
-correctness issues.
-If you want to obtain results that are identical to those obtained
-by the original code, then you may need to disable some optimizations
-by passing the "--fmad=false" option.
-
-
-Compiling the generated OpenCL code with gcc
-
-To compile the host code you need to link against the file
-ocl_utilities.c which contains utility functions used by the generated
-OpenCL host code. To compile the host code with gcc, run
-
-	gcc -std=c99 file_host.c ocl_utilities.c -lOpenCL
-
-Note that we have experienced the generated OpenCL code freezing
-on some inputs (e.g., the PolyBench symm benchmark) when using
-at least some version of the Nvidia OpenCL library, while the
-corresponding CUDA code runs fine.
-We have experienced no such freezes when using AMD, ARM or Intel
-OpenCL libraries.
-
-By default, the compiled executable will need the _kernel.cl file at
-run time. Alternatively, the option --opencl-embed-kernel-code may be
-given to place the kernel code in a string literal. The kernel code is
-then compiled into the host binary, such that the _kernel.cl file is no
-longer needed at run time. Any kernel include files, in particular
-those supplied using --opencl-include-file, will still be required at
-run time.
-
-
-Function calls
-
-Function calls inside the analyzed fragment are reproduced
-in the CUDA or OpenCL code, but for now it is left to the user
-to make sure that the functions that are being called are
-available from the generated kernels.
-
-In the case of OpenCL code, the --opencl-include-file option
-may be used to specify one or more files to be #include'd
-from the generated code. These files may then contain
-the definitions of the functions being called from the
-program fragment. If the pathnames of the included files
-are relative to the current directory, then you may need
-to additionally specify the --opencl-compiler-options=-I.
-to make sure that the files can be found by the OpenCL compiler.
-The included files may contain definitions of types used by the
-generated kernels. By default, PPCG generates definitions for
-types as needed, but these definitions may collide with those in
-the included files, as PPCG does not consider the contents of the
-included files. The --no-opencl-print-kernel-types option will prevent
-PPCG from generating type definitions.
-
-
-GNU extensions
-
-By default, PPCG may print out macro definitions that involve
-GNU extensions such as __typeof__ and statement expressions.
-Some compilers may not support these extensions.
-In particular, OpenCL 1.2 beignet 1.1.1 (git-6de6918)
-has been reported not to support __typeof__.
-The use of these extensions can be turned off with the
---no-allow-gnu-extensions option.
-
-
-Processing PolyBench
-
-When processing a PolyBench/C 3.2 benchmark, you should always specify
--DPOLYBENCH_USE_C99_PROTO on the ppcg command line. Otherwise, the source
-files are inconsistent, having fixed size arrays but parametrically
-bounded loops iterating over them.
-However, you should not specify this define when compiling
-the PPCG generated code using nvcc since CUDA does not support VLAs.
-
-
-CUDA and function overloading
-
-While CUDA supports function overloading based on the argument types,
-no such function overloading exists in the input language C. Since PPCG
-simply prints out the same function name as in the original code, this
-may result in a different function being called based on the types
-of the arguments. For example, if the original code contains a call
-to the function sqrt() with a float argument, then the argument will
-be promoted to a double and the sqrt() function will be called.
-In the transformed (CUDA) code, however, overloading will cause the
-function sqrtf() to be called. Until this issue has been resolved in PPCG,
-we recommend that users either explicitly call the function sqrtf() or
-explicitly cast the argument to double in the input code.
-
-
-Contact
-
-For bug reports, feature requests and questions,
-contact http://groups.google.com/group/isl-development
-
-Whenever you report a bug, please mention the exact version of PPCG
-that you are using (output of "./ppcg --version"). If you are unable
-to compile PPCG, then report the git version (output of "git describe")
-or the version number included in the name of the tarball.
-
-
-Citing PPCG
-
-If you use PPCG for your research, you are invited to cite
-the following paper.
-
-@article{Verdoolaege2013PPCG,
-    author = {Verdoolaege, Sven and Juega, Juan Carlos and Cohen, Albert and
-              G\'{o}mez, Jos{\'e} Ignacio and Tenllado, Christian and
-              Catthoor, Francky},
-    title = {Polyhedral parallel code generation for CUDA},
-    journal = {ACM Trans. Archit. Code Optim.},
-    issue_date = {January 2013},
-    volume = {9},
-    number = {4},
-    month = jan,
-    year = {2013},
-    issn = {1544-3566},
-    pages = {54:1--54:23},
-    doi = {10.1145/2400682.2400713},
-    acmid = {2400713},
-    publisher = {ACM},
-    address = {New York, NY, USA},
-}
diff --git a/polly/lib/External/ppcg/cpu.h b/polly/lib/External/ppcg/cpu.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/cpu.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _CPU_H
-#define _CPU_H
-
-#include
-
-#include "ppcg.h"
-
-struct ppcg_options;
-
-__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
-	struct ppcg_scop *ps, struct ppcg_options *options);
-int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
-	const char *input, const char *output);
-
-#endif
diff --git a/polly/lib/External/ppcg/cpu.c b/polly/lib/External/ppcg/cpu.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/cpu.c
+++ /dev/null
@@ -1,802 +0,0 @@
-/*
- * Copyright 2012 INRIA Paris-Rocquencourt
- * Copyright 2012 Ecole Normale Superieure
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Tobias Grosser, INRIA Paris-Rocquencourt,
- * Domaine de Voluceau, Rocquencourt, B.P. 105,
- * 78153 Le Chesnay Cedex France
- * and Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
- */
-
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "ppcg.h"
-#include "ppcg_options.h"
-#include "cpu.h"
-#include "print.h"
-#include "schedule.h"
-#include "util.h"
-
-/* Representation of a statement inside a generated AST.
- *
- * "stmt" refers to the original statement.
- * "ref2expr" maps the reference identifier of each access in
- * the statement to an AST expression that should be printed
- * at the place of the access.
- */
-struct ppcg_stmt {
-	struct pet_stmt *stmt;
-
-	isl_id_to_ast_expr *ref2expr;
-};
-
-static void ppcg_stmt_free(void *user)
-{
-	struct ppcg_stmt *stmt = user;
-
-	if (!stmt)
-		return;
-
-	isl_id_to_ast_expr_free(stmt->ref2expr);
-
-	free(stmt);
-}
-
-/* Derive the output file name from the input file name.
- * 'input' is the entire path of the input file. The output - * is the file name plus the additional extension. - * - * We will basically replace everything after the last point - * with '.ppcg.c'. This means file.c becomes file.ppcg.c - */ -static FILE *get_output_file(const char *input, const char *output) -{ - char name[PATH_MAX]; - const char *ext; - const char ppcg_marker[] = ".ppcg"; - int len; - FILE *file; - - len = ppcg_extract_base_name(name, input); - - strcpy(name + len, ppcg_marker); - ext = strrchr(input, '.'); - strcpy(name + len + sizeof(ppcg_marker) - 1, ext ? ext : ".c"); - - if (!output) - output = name; - - file = fopen(output, "w"); - if (!file) { - fprintf(stderr, "Unable to open '%s' for writing\n", output); - return NULL; - } - - return file; -} - -/* Data used to annotate for nodes in the ast. - */ -struct ast_node_userinfo { - /* The for node is an openmp parallel for node. */ - int is_openmp; -}; - -/* Information used while building the ast. - */ -struct ast_build_userinfo { - /* The current ppcg scop. */ - struct ppcg_scop *scop; - - /* Are we currently in a parallel for loop? */ - int in_parallel_for; -}; - -/* Check if the current scheduling dimension is parallel. - * - * We check for parallelism by verifying that the loop does not carry any - * dependences. - * If the live_range_reordering option is set, then this currently - * includes the order dependences. In principle, non-zero order dependences - * could be allowed, but this would require privatization and/or expansion. - * - * Parallelism test: if the distance is zero in all outer dimensions, then it - * has to be zero in the current dimension as well. - * Implementation: first, translate dependences into time space, then force - * outer dimensions to be equal. If the distance is zero in the current - * dimension, then the loop is parallel. - * The distance is zero in the current dimension if it is a subset of a map - * with equal values for the current dimension. - */ -static int ast_schedule_dim_is_parallel(__isl_keep isl_ast_build *build, - struct ppcg_scop *scop) -{ - isl_union_map *schedule, *deps; - isl_map *schedule_deps, *test; - isl_space *schedule_space; - unsigned i, dimension, is_parallel; - - schedule = isl_ast_build_get_schedule(build); - schedule_space = isl_ast_build_get_schedule_space(build); - - dimension = isl_space_dim(schedule_space, isl_dim_out) - 1; - - deps = isl_union_map_copy(scop->dep_flow); - deps = isl_union_map_union(deps, isl_union_map_copy(scop->dep_false)); - if (scop->options->live_range_reordering) { - isl_union_map *order = isl_union_map_copy(scop->dep_order); - deps = isl_union_map_union(deps, order); - } - deps = isl_union_map_apply_range(deps, isl_union_map_copy(schedule)); - deps = isl_union_map_apply_domain(deps, schedule); - - if (isl_union_map_is_empty(deps)) { - isl_union_map_free(deps); - isl_space_free(schedule_space); - return 1; - } - - schedule_deps = isl_map_from_union_map(deps); - - for (i = 0; i < dimension; i++) - schedule_deps = isl_map_equate(schedule_deps, isl_dim_out, i, - isl_dim_in, i); - - test = isl_map_universe(isl_map_get_space(schedule_deps)); - test = isl_map_equate(test, isl_dim_out, dimension, isl_dim_in, - dimension); - is_parallel = isl_map_is_subset(schedule_deps, test); - - isl_space_free(schedule_space); - isl_map_free(test); - isl_map_free(schedule_deps); - - return is_parallel; -} - -/* Mark a for node openmp parallel, if it is the outermost parallel for node. 
- */
-static void mark_openmp_parallel(__isl_keep isl_ast_build *build,
-	struct ast_build_userinfo *build_info,
-	struct ast_node_userinfo *node_info)
-{
-	if (build_info->in_parallel_for)
-		return;
-
-	if (ast_schedule_dim_is_parallel(build, build_info->scop)) {
-		build_info->in_parallel_for = 1;
-		node_info->is_openmp = 1;
-	}
-}
-
-/* Allocate an ast_node_info structure and initialize it with default values.
- */
-static struct ast_node_userinfo *allocate_ast_node_userinfo()
-{
-	struct ast_node_userinfo *node_info;
-	node_info = (struct ast_node_userinfo *)
-		malloc(sizeof(struct ast_node_userinfo));
-	node_info->is_openmp = 0;
-	return node_info;
-}
-
-/* Free an ast_node_info structure.
- */
-static void free_ast_node_userinfo(void *ptr)
-{
-	struct ast_node_userinfo *info;
-	info = (struct ast_node_userinfo *) ptr;
-	free(info);
-}
-
-/* This method is executed before the construction of a for node. It creates
- * an isl_id that is used to annotate the subsequently generated ast for nodes.
- *
- * In this function we also run the following analyses:
- *
- * - Detection of openmp parallel loops
- */
-static __isl_give isl_id *ast_build_before_for(
-	__isl_keep isl_ast_build *build, void *user)
-{
-	isl_id *id;
-	struct ast_build_userinfo *build_info;
-	struct ast_node_userinfo *node_info;
-
-	build_info = (struct ast_build_userinfo *) user;
-	node_info = allocate_ast_node_userinfo();
-	id = isl_id_alloc(isl_ast_build_get_ctx(build), "", node_info);
-	id = isl_id_set_free_user(id, free_ast_node_userinfo);
-
-	mark_openmp_parallel(build, build_info, node_info);
-
-	return id;
-}
-
-/* This method is executed after the construction of a for node.
- *
- * It performs the following actions:
- *
- * - Reset the 'in_parallel_for' flag, as soon as we leave a for node,
- *   that is marked as openmp parallel.
- *
- */
-static __isl_give isl_ast_node *ast_build_after_for(
-	__isl_take isl_ast_node *node, __isl_keep isl_ast_build *build,
-	void *user)
-{
-	isl_id *id;
-	struct ast_build_userinfo *build_info;
-	struct ast_node_userinfo *info;
-
-	id = isl_ast_node_get_annotation(node);
-	info = isl_id_get_user(id);
-
-	if (info && info->is_openmp) {
-		build_info = (struct ast_build_userinfo *) user;
-		build_info->in_parallel_for = 0;
-	}
-
-	isl_id_free(id);
-
-	return node;
-}
-
-/* Find the element in scop->stmts that has the given "id".
- */
-static struct pet_stmt *find_stmt(struct ppcg_scop *scop, __isl_keep isl_id *id)
-{
-	int i;
-
-	for (i = 0; i < scop->pet->n_stmt; ++i) {
-		struct pet_stmt *stmt = scop->pet->stmts[i];
-		isl_id *id_i;
-
-		id_i = isl_set_get_tuple_id(stmt->domain);
-		isl_id_free(id_i);
-
-		if (id_i == id)
-			return stmt;
-	}
-
-	isl_die(isl_id_get_ctx(id), isl_error_internal,
-		"statement not found", return NULL);
-}
-
-/* Print a user statement in the generated AST.
- * The ppcg_stmt has been attached to the node in at_each_domain.
- */
-static __isl_give isl_printer *print_user(__isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *print_options,
-	__isl_keep isl_ast_node *node, void *user)
-{
-	struct ppcg_stmt *stmt;
-	isl_id *id;
-
-	id = isl_ast_node_get_annotation(node);
-	stmt = isl_id_get_user(id);
-	isl_id_free(id);
-
-	p = pet_stmt_print_body(stmt->stmt, p, stmt->ref2expr);
-
-	isl_ast_print_options_free(print_options);
-
-	return p;
-}
-
-
-/* Print a for loop node as an openmp parallel loop.
- *
- * To print an openmp parallel loop we print a normal for loop, but add
- * "#pragma omp parallel for" in front.
- * - * Variables that are declared within the body of this for loop are - * automatically openmp 'private'. Iterators declared outside of the - * for loop are automatically openmp 'shared'. As ppcg declares all iterators - * at the position where they are assigned, there is no need to explicitly mark - * variables. Their automatically assigned type is already correct. - * - * This function only generates valid OpenMP code, if the ast was generated - * with the 'atomic-bounds' option enabled. - * - */ -static __isl_give isl_printer *print_for_with_openmp( - __isl_keep isl_ast_node *node, __isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "#pragma omp parallel for"); - p = isl_printer_end_line(p); - - p = isl_ast_node_for_print(node, p, print_options); - - return p; -} - -/* Print a for node. - * - * Depending on how the node is annotated, we either print a normal - * for node or an openmp parallel for node. - */ -static __isl_give isl_printer *print_for(__isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options, - __isl_keep isl_ast_node *node, void *user) -{ - isl_id *id; - int openmp; - - openmp = 0; - id = isl_ast_node_get_annotation(node); - - if (id) { - struct ast_node_userinfo *info; - - info = (struct ast_node_userinfo *) isl_id_get_user(id); - if (info && info->is_openmp) - openmp = 1; - } - - if (openmp) - p = print_for_with_openmp(node, p, print_options); - else - p = isl_ast_node_for_print(node, p, print_options); - - isl_id_free(id); - - return p; -} - -/* Index transformation callback for pet_stmt_build_ast_exprs. - * - * "index" expresses the array indices in terms of statement iterators - * "iterator_map" expresses the statement iterators in terms of - * AST loop iterators. - * - * The result expresses the array indices in terms of - * AST loop iterators. - */ -static __isl_give isl_multi_pw_aff *pullback_index( - __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *id, void *user) -{ - isl_pw_multi_aff *iterator_map = user; - - iterator_map = isl_pw_multi_aff_copy(iterator_map); - return isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map); -} - -/* Transform the accesses in the statement associated to the domain - * called by "node" to refer to the AST loop iterators, construct - * corresponding AST expressions using "build", - * collect them in a ppcg_stmt and annotate the node with the ppcg_stmt. 
- */ -static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build, void *user) -{ - struct ppcg_scop *scop = user; - isl_ast_expr *expr, *arg; - isl_ctx *ctx; - isl_id *id; - isl_map *map; - isl_pw_multi_aff *iterator_map; - struct ppcg_stmt *stmt; - - ctx = isl_ast_node_get_ctx(node); - stmt = isl_calloc_type(ctx, struct ppcg_stmt); - if (!stmt) - goto error; - - expr = isl_ast_node_user_get_expr(node); - arg = isl_ast_expr_get_op_arg(expr, 0); - isl_ast_expr_free(expr); - id = isl_ast_expr_get_id(arg); - isl_ast_expr_free(arg); - stmt->stmt = find_stmt(scop, id); - isl_id_free(id); - if (!stmt->stmt) - goto error; - - map = isl_map_from_union_map(isl_ast_build_get_schedule(build)); - map = isl_map_reverse(map); - iterator_map = isl_pw_multi_aff_from_map(map); - stmt->ref2expr = pet_stmt_build_ast_exprs(stmt->stmt, build, - &pullback_index, iterator_map, NULL, NULL); - isl_pw_multi_aff_free(iterator_map); - - id = isl_id_alloc(isl_ast_node_get_ctx(node), NULL, stmt); - id = isl_id_set_free_user(id, &ppcg_stmt_free); - return isl_ast_node_set_annotation(node, id); -error: - ppcg_stmt_free(stmt); - return isl_ast_node_free(node); -} - -/* Set *depth (initialized to 0 by the caller) to the maximum - * of the schedule depths of the leaf nodes for which this function is called. - */ -static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user) -{ - int *depth = user; - int node_depth; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf) - return isl_bool_true; - node_depth = isl_schedule_node_get_schedule_depth(node); - if (node_depth > *depth) - *depth = node_depth; - - return isl_bool_false; -} - -/* This function is called for each node in a CPU AST. - * In case of a user node, print the macro definitions required - * for printing the AST expressions in the annotation, if any. - * For other nodes, return true such that descendants are also - * visited. - * - * In particular, print the macro definitions needed for the substitutions - * of the original user statements. - */ -static isl_bool at_node(__isl_keep isl_ast_node *node, void *user) -{ - struct ppcg_stmt *stmt; - isl_id *id; - isl_printer **p = user; - - if (isl_ast_node_get_type(node) != isl_ast_node_user) - return isl_bool_true; - - id = isl_ast_node_get_annotation(node); - stmt = isl_id_get_user(id); - isl_id_free(id); - - if (!stmt) - return isl_bool_error; - - *p = ppcg_print_body_macros(*p, stmt->ref2expr); - if (!*p) - return isl_bool_error; - - return isl_bool_false; -} - -/* Print the required macros for the CPU AST "node" to "p", - * including those needed for the user statements inside the AST. - */ -static __isl_give isl_printer *cpu_print_macros(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node) -{ - if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0) - return isl_printer_free(p); - p = ppcg_print_macros(p, node); - return p; -} - -/* Code generate the scop 'scop' using "schedule" - * and print the corresponding C code to 'p'. 
- */ -static __isl_give isl_printer *print_scop(struct ppcg_scop *scop, - __isl_take isl_schedule *schedule, __isl_take isl_printer *p, - struct ppcg_options *options) -{ - isl_ctx *ctx = isl_printer_get_ctx(p); - isl_ast_build *build; - isl_ast_print_options *print_options; - isl_ast_node *tree; - isl_id_list *iterators; - struct ast_build_userinfo build_info; - int depth; - - depth = 0; - if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth, - &depth) < 0) - goto error; - - build = isl_ast_build_alloc(ctx); - iterators = ppcg_scop_generate_names(scop, depth, "c"); - build = isl_ast_build_set_iterators(build, iterators); - build = isl_ast_build_set_at_each_domain(build, &at_each_domain, scop); - - if (options->openmp) { - build_info.scop = scop; - build_info.in_parallel_for = 0; - - build = isl_ast_build_set_before_each_for(build, - &ast_build_before_for, - &build_info); - build = isl_ast_build_set_after_each_for(build, - &ast_build_after_for, - &build_info); - } - - tree = isl_ast_build_node_from_schedule(build, schedule); - isl_ast_build_free(build); - - print_options = isl_ast_print_options_alloc(ctx); - print_options = isl_ast_print_options_set_print_user(print_options, - &print_user, NULL); - - print_options = isl_ast_print_options_set_print_for(print_options, - &print_for, NULL); - - p = cpu_print_macros(p, tree); - p = isl_ast_node_print(tree, p, print_options); - - isl_ast_node_free(tree); - - return p; -error: - isl_schedule_free(schedule); - isl_printer_free(p); - return NULL; -} - -/* Tile the band node "node" with tile sizes "sizes" and - * mark all members of the resulting tile node as "atomic". - */ -static __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node, - __isl_take isl_multi_val *sizes) -{ - node = isl_schedule_node_band_tile(node, sizes); - node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); - - return node; -} - -/* Tile "node", if it is a band node with at least 2 members. - * The tile sizes are set from the "tile_size" option. - */ -static __isl_give isl_schedule_node *tile_band( - __isl_take isl_schedule_node *node, void *user) -{ - struct ppcg_scop *scop = user; - int n; - isl_space *space; - isl_multi_val *sizes; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_band) - return node; - - n = isl_schedule_node_band_n_member(node); - if (n <= 1) - return node; - - space = isl_schedule_node_band_get_space(node); - sizes = ppcg_multi_val_from_int(space, scop->options->tile_size); - - return tile(node, sizes); -} - -/* Construct schedule constraints from the dependences in ps - * for the purpose of computing a schedule for a CPU. - * - * The proximity constraints are set to the flow dependences. - * - * If live-range reordering is allowed then the conditional validity - * constraints are set to the order dependences with the flow dependences - * as condition. That is, a live-range (flow dependence) will be either - * local to an iteration of a band or all adjacent order dependences - * will be respected by the band. - * The validity constraints are set to the union of the flow dependences - * and the forced dependences, while the coincidence constraints - * are set to the union of the flow dependences, the forced dependences and - * the order dependences. - * - * If live-range reordering is not allowed, then both the validity - * and the coincidence constraints are set to the union of the flow - * dependences and the false dependences. 
- *
- * Note that the coincidence constraints are only set when the "openmp"
- * option is set. Even though the way openmp pragmas are introduced
- * does not rely on the coincident property of the schedule band members,
- * the coincidence constraints do affect the way the schedule is constructed,
- * such that more schedule dimensions should be detected as parallel
- * by ast_schedule_dim_is_parallel.
- * Since the order dependences are also taken into account by
- * ast_schedule_dim_is_parallel, they are also added to
- * the coincidence constraints. If the openmp handling learns
- * how to privatize some memory, then the corresponding order
- * dependences can be removed from the coincidence constraints.
- */
-static __isl_give isl_schedule_constraints *construct_cpu_schedule_constraints(
-	struct ppcg_scop *ps)
-{
-	isl_schedule_constraints *sc;
-	isl_union_map *validity, *coincidence;
-
-	sc = isl_schedule_constraints_on_domain(isl_union_set_copy(ps->domain));
-	if (ps->options->live_range_reordering) {
-		sc = isl_schedule_constraints_set_conditional_validity(sc,
-			isl_union_map_copy(ps->tagged_dep_flow),
-			isl_union_map_copy(ps->tagged_dep_order));
-		validity = isl_union_map_copy(ps->dep_flow);
-		validity = isl_union_map_union(validity,
-			isl_union_map_copy(ps->dep_forced));
-		if (ps->options->openmp) {
-			coincidence = isl_union_map_copy(validity);
-			coincidence = isl_union_map_union(coincidence,
-				isl_union_map_copy(ps->dep_order));
-		}
-	} else {
-		validity = isl_union_map_copy(ps->dep_flow);
-		validity = isl_union_map_union(validity,
-			isl_union_map_copy(ps->dep_false));
-		if (ps->options->openmp)
-			coincidence = isl_union_map_copy(validity);
-	}
-	if (ps->options->openmp)
-		sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
-	sc = isl_schedule_constraints_set_validity(sc, validity);
-	sc = isl_schedule_constraints_set_proximity(sc,
-		isl_union_map_copy(ps->dep_flow));
-
-	return sc;
-}
-
-/* Compute a schedule for the scop "ps".
- *
- * First derive the appropriate schedule constraints from the dependences
- * in "ps" and then compute a schedule from those schedule constraints,
- * possibly grouping statement instances based on the input schedule.
- */
-static __isl_give isl_schedule *compute_cpu_schedule(struct ppcg_scop *ps)
-{
-	isl_schedule_constraints *sc;
-	isl_schedule *schedule;
-
-	if (!ps)
-		return NULL;
-
-	sc = construct_cpu_schedule_constraints(ps);
-
-	if (ps->options->debug->dump_schedule_constraints)
-		isl_schedule_constraints_dump(sc);
-	schedule = ppcg_compute_schedule(sc, ps->schedule, ps->options);
-
-	return schedule;
-}
-
-/* Compute a new schedule for the scop "ps" if the reschedule option is set.
- * Otherwise, return a copy of the original schedule.
- */
-static __isl_give isl_schedule *optionally_compute_schedule(void *user)
-{
-	struct ppcg_scop *ps = user;
-
-	if (!ps)
-		return NULL;
-	if (!ps->options->reschedule)
-		return isl_schedule_copy(ps->schedule);
-	return compute_cpu_schedule(ps);
-}
-
-/* Compute a schedule based on the dependences in "ps" and
- * tile it if requested by the user.
- */ -static __isl_give isl_schedule *get_schedule(struct ppcg_scop *ps, - struct ppcg_options *options) -{ - isl_ctx *ctx; - isl_schedule *schedule; - - if (!ps) - return NULL; - - ctx = isl_union_set_get_ctx(ps->domain); - schedule = ppcg_get_schedule(ctx, options, - &optionally_compute_schedule, ps); - if (ps->options->tile) - schedule = isl_schedule_map_schedule_node_bottom_up(schedule, - &tile_band, ps); - - return schedule; -} - -/* Generate CPU code for the scop "ps" using "schedule" and - * print the corresponding C code to "p", including variable declarations. - */ -static __isl_give isl_printer *print_cpu_with_schedule( - __isl_take isl_printer *p, struct ppcg_scop *ps, - __isl_take isl_schedule *schedule, struct ppcg_options *options) -{ - int hidden; - isl_set *context; - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "/* ppcg generated CPU code */"); - p = isl_printer_end_line(p); - - p = isl_printer_start_line(p); - p = isl_printer_end_line(p); - - p = ppcg_set_macro_names(p); - p = ppcg_print_exposed_declarations(p, ps); - hidden = ppcg_scop_any_hidden_declarations(ps); - if (hidden) { - p = ppcg_start_block(p); - p = ppcg_print_hidden_declarations(p, ps); - } - - context = isl_set_copy(ps->context); - context = isl_set_from_params(context); - schedule = isl_schedule_insert_context(schedule, context); - if (options->debug->dump_final_schedule) - isl_schedule_dump(schedule); - p = print_scop(ps, schedule, p, options); - if (hidden) - p = ppcg_end_block(p); - - return p; -} - -/* Generate CPU code for the scop "ps" and print the corresponding C code - * to "p", including variable declarations. - */ -__isl_give isl_printer *print_cpu(__isl_take isl_printer *p, - struct ppcg_scop *ps, struct ppcg_options *options) -{ - isl_schedule *schedule; - - schedule = isl_schedule_copy(ps->schedule); - return print_cpu_with_schedule(p, ps, schedule, options); -} - -/* Generate CPU code for "scop" and print it to "p". - * - * First obtain a schedule for "scop" and then print code for "scop" - * using that schedule. - */ -static __isl_give isl_printer *generate(__isl_take isl_printer *p, - struct ppcg_scop *scop, struct ppcg_options *options) -{ - isl_schedule *schedule; - - schedule = get_schedule(scop, options); - - return print_cpu_with_schedule(p, scop, schedule, options); -} - -/* Wrapper around generate for use as a ppcg_transform callback. - */ -static __isl_give isl_printer *print_cpu_wrap(__isl_take isl_printer *p, - struct ppcg_scop *scop, void *user) -{ - struct ppcg_options *options = user; - - return generate(p, scop, options); -} - -/* Transform the code in the file called "input" by replacing - * all scops by corresponding CPU code and write the results to a file - * called "output". 
- */
-int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
	const char *input, const char *output)
-{
-	FILE *output_file;
-	int r;
-
-	output_file = get_output_file(input, output);
-	if (!output_file)
-		return -1;
-
-	r = ppcg_transform(ctx, input, output_file, options,
-		&print_cpu_wrap, options);
-
-	fclose(output_file);
-
-	return r;
-}
diff --git a/polly/lib/External/ppcg/cuda.h b/polly/lib/External/ppcg/cuda.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/cuda.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _CUDA_H
-#define _CUDA_H
-
-#include "ppcg_options.h"
-#include "ppcg.h"
-
-int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
-	const char *input);
-
-__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *print_options,
-	__isl_keep isl_ast_node *node, void *user);
-#endif
diff --git a/polly/lib/External/ppcg/cuda.c b/polly/lib/External/ppcg/cuda.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/cuda.c
+++ /dev/null
@@ -1,730 +0,0 @@
-/*
- * Copyright 2012 Ecole Normale Superieure
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
- */
-
-#include
-#include
-
-#include "cuda_common.h"
-#include "cuda.h"
-#include "gpu.h"
-#include "gpu_print.h"
-#include "print.h"
-#include "util.h"
-
-static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
-{
-	const char *macros =
-		"#define cudaCheckReturn(ret) \\\n"
-		"  do { \\\n"
-		"    cudaError_t cudaCheckReturn_e = (ret); \\\n"
-		"    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
-		"      fprintf(stderr, \"CUDA error: %s\\n\", "
-		"cudaGetErrorString(cudaCheckReturn_e)); \\\n"
-		"      fflush(stderr); \\\n"
-		"    } \\\n"
-		"    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
-		"  } while(0)\n"
-		"#define cudaCheckKernel() \\\n"
-		"  do { \\\n"
-		"    cudaCheckReturn(cudaGetLastError()); \\\n"
-		"  } while(0)\n\n";
-
-	p = isl_printer_print_str(p, macros);
-	return p;
-}
-
-/* Print a declaration for the device array corresponding to "array" on "p".
- */ -static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p, - struct gpu_array_info *array) -{ - int i; - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, array->type); - p = isl_printer_print_str(p, " "); - if (!array->linearize && array->n_index > 1) - p = isl_printer_print_str(p, "("); - p = isl_printer_print_str(p, "*dev_"); - p = isl_printer_print_str(p, array->name); - if (!array->linearize && array->n_index > 1) { - p = isl_printer_print_str(p, ")"); - for (i = 1; i < array->n_index; i++) { - isl_ast_expr *bound; - bound = isl_ast_expr_get_op_arg(array->bound_expr, - 1 + i); - p = isl_printer_print_str(p, "["); - p = isl_printer_print_ast_expr(p, bound); - p = isl_printer_print_str(p, "]"); - isl_ast_expr_free(bound); - } - } - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - if (!gpu_array_requires_device_allocation(&prog->array[i])) - continue; - - p = declare_device_array(p, &prog->array[i]); - } - p = isl_printer_start_line(p); - p = isl_printer_end_line(p); - return p; -} - -static __isl_give isl_printer *allocate_device_arrays( - __isl_take isl_printer *p, struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - - if (!gpu_array_requires_device_allocation(&prog->array[i])) - continue; - p = ppcg_ast_expr_print_macros(array->bound_expr, p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, - "cudaCheckReturn(cudaMalloc((void **) &dev_"); - p = isl_printer_print_str(p, prog->array[i].name); - p = isl_printer_print_str(p, ", "); - p = gpu_array_info_print_size(p, &prog->array[i]); - p = isl_printer_print_str(p, "));"); - p = isl_printer_end_line(p); - } - p = isl_printer_start_line(p); - p = isl_printer_end_line(p); - return p; -} - -static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - if (!gpu_array_requires_device_allocation(&prog->array[i])) - continue; - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_"); - p = isl_printer_print_str(p, prog->array[i].name); - p = isl_printer_print_str(p, "));"); - p = isl_printer_end_line(p); - } - - return p; -} - -/* Print code to "p" for copying "array" from the host to the device - * in its entirety. The bounds on the extent of "array" have - * been precomputed in extract_array_info and are used in - * gpu_array_info_print_size. - */ -static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p, - struct gpu_array_info *array) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_"); - p = isl_printer_print_str(p, array->name); - p = isl_printer_print_str(p, ", "); - - if (gpu_array_is_scalar(array)) - p = isl_printer_print_str(p, "&"); - p = isl_printer_print_str(p, array->name); - p = isl_printer_print_str(p, ", "); - - p = gpu_array_info_print_size(p, array); - p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));"); - p = isl_printer_end_line(p); - - return p; -} - -/* Print code to "p" for copying "array" back from the device to the host - * in its entirety. The bounds on the extent of "array" have - * been precomputed in extract_array_info and are used in - * gpu_array_info_print_size. 
- */ -static __isl_give isl_printer *copy_array_from_device( - __isl_take isl_printer *p, struct gpu_array_info *array) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy("); - if (gpu_array_is_scalar(array)) - p = isl_printer_print_str(p, "&"); - p = isl_printer_print_str(p, array->name); - p = isl_printer_print_str(p, ", dev_"); - p = isl_printer_print_str(p, array->name); - p = isl_printer_print_str(p, ", "); - p = gpu_array_info_print_size(p, array); - p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));"); - p = isl_printer_end_line(p); - - return p; -} - -static __isl_give isl_printer* print_reverse_list(__isl_take isl_printer *p, int len, int *list) -{ - int i; - - if (len == 0) - return p; - - p = isl_printer_print_str(p, "("); - for (i = 0; i < len; ++i) { - if (i) - p = isl_printer_print_str(p, ", "); - p = isl_printer_print_int(p, list[len - 1 - i]); - } - return isl_printer_print_str(p, ")"); -} - -/* Print the effective grid size as a list of the sizes in each - * dimension, from innermost to outermost. - */ -static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p, - struct ppcg_kernel *kernel) -{ - int i; - int dim; - - dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set); - if (dim == 0) - return p; - - p = isl_printer_print_str(p, "("); - for (i = dim - 1; i >= 0; --i) { - isl_ast_expr *bound; - - bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i); - p = isl_printer_print_ast_expr(p, bound); - isl_ast_expr_free(bound); - - if (i > 0) - p = isl_printer_print_str(p, ", "); - } - - p = isl_printer_print_str(p, ")"); - - return p; -} - -/* Print the grid definition. - */ -static __isl_give isl_printer *print_grid(__isl_take isl_printer *p, - struct ppcg_kernel *kernel) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "dim3 k"); - p = isl_printer_print_int(p, kernel->id); - p = isl_printer_print_str(p, "_dimGrid"); - p = print_grid_size(p, kernel); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -/* Print the arguments to a kernel declaration or call. If "types" is set, - * then print a declaration (including the types of the arguments). 
- * - * The arguments are printed in the following order - * - the arrays accessed by the kernel - * - the parameters - * - the host loop iterators - */ -static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p, - struct gpu_prog *prog, struct ppcg_kernel *kernel, int types) -{ - int i, n; - int first = 1; - unsigned nparam; - isl_space *space; - const char *type; - - for (i = 0; i < prog->n_array; ++i) { - int required; - - required = ppcg_kernel_requires_array_argument(kernel, i); - if (required < 0) - return isl_printer_free(p); - if (!required) - continue; - - if (!first) - p = isl_printer_print_str(p, ", "); - - if (types) - p = gpu_array_info_print_declaration_argument(p, - &prog->array[i], NULL); - else - p = gpu_array_info_print_call_argument(p, - &prog->array[i]); - - first = 0; - } - - space = isl_union_set_get_space(kernel->arrays); - nparam = isl_space_dim(space, isl_dim_param); - for (i = 0; i < nparam; ++i) { - const char *name; - - name = isl_space_get_dim_name(space, isl_dim_param, i); - - if (!first) - p = isl_printer_print_str(p, ", "); - if (types) - p = isl_printer_print_str(p, "int "); - p = isl_printer_print_str(p, name); - - first = 0; - } - isl_space_free(space); - - n = isl_space_dim(kernel->space, isl_dim_set); - type = isl_options_get_ast_iterator_type(prog->ctx); - for (i = 0; i < n; ++i) { - const char *name; - - if (!first) - p = isl_printer_print_str(p, ", "); - name = isl_space_get_dim_name(kernel->space, isl_dim_set, i); - if (types) { - p = isl_printer_print_str(p, type); - p = isl_printer_print_str(p, " "); - } - p = isl_printer_print_str(p, name); - - first = 0; - } - - return p; -} - -/* Print the header of the given kernel. - */ -static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p, - struct gpu_prog *prog, struct ppcg_kernel *kernel) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "__global__ void kernel"); - p = isl_printer_print_int(p, kernel->id); - p = isl_printer_print_str(p, "("); - p = print_kernel_arguments(p, prog, kernel, 1); - p = isl_printer_print_str(p, ")"); - - return p; -} - -/* Print the header of the given kernel to both gen->cuda.kernel_h - * and gen->cuda.kernel_c. - */ -static void print_kernel_headers(struct gpu_prog *prog, - struct ppcg_kernel *kernel, struct cuda_info *cuda) -{ - isl_printer *p; - - p = isl_printer_to_file(prog->ctx, cuda->kernel_h); - p = isl_printer_set_output_format(p, ISL_FORMAT_C); - p = print_kernel_header(p, prog, kernel); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - isl_printer_free(p); - - p = isl_printer_to_file(prog->ctx, cuda->kernel_c); - p = isl_printer_set_output_format(p, ISL_FORMAT_C); - p = print_kernel_header(p, prog, kernel); - p = isl_printer_end_line(p); - isl_printer_free(p); -} - -static void print_indent(FILE *dst, int indent) -{ - fprintf(dst, "%*s", indent, ""); -} - -/* Print a list of iterators of type "type" with names "ids" to "out". - * Each iterator is assigned one of the cuda identifiers in cuda_dims. - * In particular, the last iterator is assigned the x identifier - * (the first in the list of cuda identifiers). 
- */ -static void print_iterators(FILE *out, const char *type, - __isl_keep isl_id_list *ids, const char *cuda_dims[]) -{ - int i, n; - - n = isl_id_list_n_id(ids); - if (n <= 0) - return; - print_indent(out, 4); - fprintf(out, "%s ", type); - for (i = 0; i < n; ++i) { - isl_id *id; - - if (i) - fprintf(out, ", "); - id = isl_id_list_get_id(ids, i); - fprintf(out, "%s = %s", isl_id_get_name(id), - cuda_dims[n - 1 - i]); - isl_id_free(id); - } - fprintf(out, ";\n"); -} - -static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel) -{ - isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree); - const char *type; - const char *block_dims[] = { "blockIdx.x", "blockIdx.y" }; - const char *thread_dims[] = { "threadIdx.x", "threadIdx.y", - "threadIdx.z" }; - - type = isl_options_get_ast_iterator_type(ctx); - - print_iterators(out, type, kernel->block_ids, block_dims); - print_iterators(out, type, kernel->thread_ids, thread_dims); -} - -static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p, - struct ppcg_kernel_var *var) -{ - int j; - - p = isl_printer_start_line(p); - if (var->type == ppcg_access_shared) - p = isl_printer_print_str(p, "__shared__ "); - p = isl_printer_print_str(p, var->array->type); - p = isl_printer_print_str(p, " "); - p = isl_printer_print_str(p, var->name); - for (j = 0; j < var->array->n_index; ++j) { - isl_val *v; - - p = isl_printer_print_str(p, "["); - v = isl_vec_get_element_val(var->size, j); - p = isl_printer_print_val(p, v); - isl_val_free(v); - p = isl_printer_print_str(p, "]"); - } - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p, - struct ppcg_kernel *kernel) -{ - int i; - - for (i = 0; i < kernel->n_var; ++i) - p = print_kernel_var(p, &kernel->var[i]); - - return p; -} - -/* Print a sync statement. - */ -static __isl_give isl_printer *print_sync(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "__syncthreads();"); - p = isl_printer_end_line(p); - - return p; -} - -/* This function is called for each user statement in the AST, - * i.e., for each kernel body statement, copy statement or sync statement. 
- */ -static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options, - __isl_keep isl_ast_node *node, void *user) -{ - isl_id *id; - struct ppcg_kernel_stmt *stmt; - - id = isl_ast_node_get_annotation(node); - stmt = isl_id_get_user(id); - isl_id_free(id); - - isl_ast_print_options_free(print_options); - - switch (stmt->type) { - case ppcg_kernel_copy: - return ppcg_kernel_print_copy(p, stmt); - case ppcg_kernel_sync: - return print_sync(p, stmt); - case ppcg_kernel_domain: - return ppcg_kernel_print_domain(p, stmt); - } - - return p; -} - -static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel, - struct cuda_info *cuda) -{ - isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree); - isl_ast_print_options *print_options; - isl_printer *p; - - print_kernel_headers(prog, kernel, cuda); - fprintf(cuda->kernel_c, "{\n"); - print_kernel_iterators(cuda->kernel_c, kernel); - - p = isl_printer_to_file(ctx, cuda->kernel_c); - p = isl_printer_set_output_format(p, ISL_FORMAT_C); - p = isl_printer_indent(p, 4); - - p = print_kernel_vars(p, kernel); - p = isl_printer_end_line(p); - p = ppcg_set_macro_names(p); - p = gpu_print_macros(p, kernel->tree); - - print_options = isl_ast_print_options_alloc(ctx); - print_options = isl_ast_print_options_set_print_user(print_options, - &print_kernel_stmt, NULL); - p = isl_ast_node_print(kernel->tree, p, print_options); - isl_printer_free(p); - - fprintf(cuda->kernel_c, "}\n"); -} - -/* Print code for initializing the device for execution of the transformed - * code. This includes declaring locally defined variables as well as - * declaring and allocating the required copies of arrays on the device. - */ -static __isl_give isl_printer *init_device(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - p = print_cuda_macros(p); - - p = gpu_print_local_declarations(p, prog); - p = declare_device_arrays(p, prog); - p = allocate_device_arrays(p, prog); - - return p; -} - -/* Print code for clearing the device after execution of the transformed code. - * In particular, free the memory that was allocated on the device. - */ -static __isl_give isl_printer *clear_device(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - p = free_device_arrays(p, prog); - - return p; -} - -/* Print a statement for copying an array to or from the device, - * or for initializing or clearing the device. - * The statement identifier of a copying node is called - * "to_device_" or "from_device_" and - * its user pointer points to the gpu_array_info of the array - * that needs to be copied. - * The node for initializing the device is called "init_device". - * The node for clearing the device is called "clear_device". - * - * Extract the array (if any) from the identifier and call - * init_device, clear_device, copy_array_to_device or copy_array_from_device. 
- */
-static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
- __isl_keep isl_ast_node *node, struct gpu_prog *prog)
-{
- isl_ast_expr *expr, *arg;
- isl_id *id;
- const char *name;
- struct gpu_array_info *array;
-
- expr = isl_ast_node_user_get_expr(node);
- arg = isl_ast_expr_get_op_arg(expr, 0);
- id = isl_ast_expr_get_id(arg);
- name = isl_id_get_name(id);
- array = isl_id_get_user(id);
- isl_id_free(id);
- isl_ast_expr_free(arg);
- isl_ast_expr_free(expr);
-
- if (!name)
- return isl_printer_free(p);
- if (!strcmp(name, "init_device"))
- return init_device(p, prog);
- if (!strcmp(name, "clear_device"))
- return clear_device(p, prog);
- if (!array)
- return isl_printer_free(p);
-
- if (!prefixcmp(name, "to_device"))
- return copy_array_to_device(p, array);
- else
- return copy_array_from_device(p, array);
-}
-
-struct print_host_user_data {
- struct cuda_info *cuda;
- struct gpu_prog *prog;
-};
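
For orientation, the statement identifiers dispatched by print_device_node above expand into plain CUDA runtime calls. A sketch for a hypothetical one-dimensional array A of 512 floats (array name and bound invented for illustration):

    /* "init_device": declare_device_arrays() and allocate_device_arrays() */
    float *dev_A;
    cudaCheckReturn(cudaMalloc((void **) &dev_A, (512) * sizeof(float)));

    /* "to_device_A" and "from_device_A": whole-array copies */
    cudaCheckReturn(cudaMemcpy(dev_A, A, (512) * sizeof(float),
        cudaMemcpyHostToDevice));
    cudaCheckReturn(cudaMemcpy(A, dev_A, (512) * sizeof(float),
        cudaMemcpyDeviceToHost));

    /* "clear_device": free_device_arrays() */
    cudaCheckReturn(cudaFree(dev_A));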
-
-/* Print the user statement of the host code to "p".
- *
- * The host code may contain original user statements, kernel launches,
- * statements that copy data to/from the device and statements
- * that initialize or clear the device.
- * The original user statements and the kernel launches have
- * an associated annotation, while the other statements do not.
- * The latter are handled by print_device_node.
- * The annotation on the user statements is called "user".
- *
- * In case of a kernel launch, print a block of statements that
- * defines the grid and the block and then launches the kernel.
- */
-__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
- __isl_take isl_ast_print_options *print_options,
- __isl_keep isl_ast_node *node, void *user)
-{
- isl_id *id;
- int is_user;
- struct ppcg_kernel *kernel;
- struct ppcg_kernel_stmt *stmt;
- struct print_host_user_data *data;
-
- isl_ast_print_options_free(print_options);
-
- data = (struct print_host_user_data *) user;
-
- id = isl_ast_node_get_annotation(node);
- if (!id)
- return print_device_node(p, node, data->prog);
-
- is_user = !strcmp(isl_id_get_name(id), "user");
- kernel = is_user ? NULL : isl_id_get_user(id);
- stmt = is_user ? isl_id_get_user(id) : NULL;
- isl_id_free(id);
-
- if (is_user)
- return ppcg_kernel_print_domain(p, stmt);
-
- p = ppcg_start_block(p);
-
- p = isl_printer_start_line(p);
- p = isl_printer_print_str(p, "dim3 k");
- p = isl_printer_print_int(p, kernel->id);
- p = isl_printer_print_str(p, "_dimBlock");
- p = print_reverse_list(p, kernel->n_block, kernel->block_dim);
- p = isl_printer_print_str(p, ";");
- p = isl_printer_end_line(p);
-
- p = print_grid(p, kernel);
-
- p = isl_printer_start_line(p);
- p = isl_printer_print_str(p, "kernel");
- p = isl_printer_print_int(p, kernel->id);
- p = isl_printer_print_str(p, " <<<k");
- p = isl_printer_print_int(p, kernel->id);
- p = isl_printer_print_str(p, "_dimGrid, k");
- p = isl_printer_print_int(p, kernel->id);
- p = isl_printer_print_str(p, "_dimBlock>>> (");
- p = print_kernel_arguments(p, data->prog, kernel, 0);
- p = isl_printer_print_str(p, ");");
- p = isl_printer_end_line(p);
-
- p = isl_printer_start_line(p);
- p = isl_printer_print_str(p, "cudaCheckKernel();");
- p = isl_printer_end_line(p);
-
- p = ppcg_end_block(p);
-
- p = isl_printer_start_line(p);
- p = isl_printer_end_line(p);
-
-#if 0
- print_kernel(data->prog, kernel, data->cuda);
-#endif
-
- return p;
-}
-
-static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
- struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
- struct cuda_info *cuda)
-{
- isl_ast_print_options *print_options;
- isl_ctx *ctx = isl_ast_node_get_ctx(tree);
- struct print_host_user_data data = { cuda, prog };
-
- print_options = isl_ast_print_options_alloc(ctx);
- print_options = isl_ast_print_options_set_print_user(print_options,
- &print_host_user, &data);
-
- p = gpu_print_macros(p, tree);
- p = isl_ast_node_print(tree, p, print_options);
-
- return p;
-}
-
-/* Given a gpu_prog "prog" and the corresponding transformed AST
- * "tree", print the entire CUDA code to "p".
- * "types" collects the types for which a definition has already
- * been printed.
- */
-static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
- struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
- struct gpu_types *types, void *user)
-{
- struct cuda_info *cuda = user;
- isl_printer *kernel;
-
- kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
- kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
- kernel = gpu_print_types(kernel, types, prog);
- isl_printer_free(kernel);
-
- if (!kernel)
- return isl_printer_free(p);
-
- p = print_host_code(p, prog, tree, cuda);
-
- return p;
-}
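
Concretely, for a hypothetical kernel 0 with the default 2D block size {32, 16}, the default 2D grid bound {256, 256} and arguments dev_A and n (all names and numbers illustrative), print_host_user emits a launch block of the form:

    {
        dim3 k0_dimBlock(16, 32);
        dim3 k0_dimGrid(256, 256);
        kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_A, n);
        cudaCheckKernel();
    }

Both print_reverse_list() and print_grid_size() emit the sizes from innermost to outermost, which is why the internally stored block size {32, 16} is printed as (16, 32).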
-
-/* Transform the code in the file called "input" by replacing
- * all scops by corresponding CUDA code.
- * The names of the output files are derived from "input".
- *
- * We let generate_gpu do all the hard work and then let it call
- * us back for printing the AST in print_cuda.
- *
- * To prepare for this printing, we first open the output files
- * and we close them after generate_gpu has finished.
- */
-int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
- const char *input)
-{
- struct cuda_info cuda;
- int r;
-
- cuda_open_files(&cuda, input);
-
- r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
-
- cuda_close_files(&cuda);
-
- return r;
-}
diff --git a/polly/lib/External/ppcg/cuda_common.h b/polly/lib/External/ppcg/cuda_common.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/cuda_common.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _CUDA_COMMON_H_
-#define _CUDA_COMMON_H_
-
-#include <stdio.h>
-
-struct cuda_info {
- FILE *host_c;
- FILE *kernel_c;
- FILE *kernel_h;
-};
-
-void cuda_open_files(struct cuda_info *info, const char *input);
-void cuda_close_files(struct cuda_info *info);
-
-#endif
diff --git a/polly/lib/External/ppcg/cuda_common.c b/polly/lib/External/ppcg/cuda_common.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/cuda_common.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2010 INRIA Saclay
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
- * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
- * 91893 Orsay, France
- */
-
-#include <ctype.h>
-#include <limits.h>
-#include <string.h>
-
-#include "cuda_common.h"
-#include "ppcg.h"
-
-/* Open the host .cu file and the kernel .hu and .cu files for writing.
- * Add the necessary includes.
- */
-void cuda_open_files(struct cuda_info *info, const char *input)
-{
- char name[PATH_MAX];
- int len;
-
- len = ppcg_extract_base_name(name, input);
-
- strcpy(name + len, "_host.cu");
- info->host_c = fopen(name, "w");
-
- strcpy(name + len, "_kernel.cu");
- info->kernel_c = fopen(name, "w");
-
- strcpy(name + len, "_kernel.hu");
- info->kernel_h = fopen(name, "w");
- fprintf(info->host_c, "#include <assert.h>\n");
- fprintf(info->host_c, "#include <stdio.h>\n");
- fprintf(info->host_c, "#include \"%s\"\n", name);
- fprintf(info->kernel_c, "#include \"%s\"\n", name);
- fprintf(info->kernel_h, "#include \"cuda.h\"\n\n");
-}
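
For an input file foo.c (name illustrative), cuda_open_files() thus creates three files with the following include structure:

    foo_host.cu:    #include <assert.h>
                    #include <stdio.h>
                    #include "foo_kernel.hu"
    foo_kernel.cu:  #include "foo_kernel.hu"
    foo_kernel.hu:  #include "cuda.h"

By the time the fprintf() calls run, the name buffer holds the last string copied into it, "foo_kernel.hu", so both .cu files pick up the kernel header.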
-
-/* Close all output files.
- */
-void cuda_close_files(struct cuda_info *info)
-{
- fclose(info->kernel_c);
- fclose(info->kernel_h);
- fclose(info->host_c);
-}
diff --git a/polly/lib/External/ppcg/external.c b/polly/lib/External/ppcg/external.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/external.c
+++ /dev/null
@@ -1,192 +0,0 @@
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <pet.h>
-#include "cpu.h"
-#include "opencl.h"
-
-
-#define die() { \
- fprintf(stderr, "Dummy function %s called\n", __FUNCTION__); \
- abort(); \
-}
-
-__isl_give isl_union_map *pet_scop_compute_outer_to_any(
- __isl_keep pet_scop *scop) {
- die();
-}
-__isl_give isl_union_map *pet_scop_compute_outer_to_inner(
- __isl_keep pet_scop *scop) {
- die();
-}
-enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree) {
- die();
-}
-int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree,
- int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
- die();
-}
-isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr) {
- die();
-}
-isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr) {
- die();
-}
-isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr) {
- die();
-}
-__isl_give isl_union_map *pet_expr_access_get_tagged_may_read(
- __isl_keep pet_expr *expr) {
- die();
-}
-__isl_give isl_union_map *pet_expr_access_get_tagged_may_write(
- __isl_keep pet_expr *expr) {
- die();
-}
-__isl_give isl_union_map *pet_expr_access_get_must_write(
- __isl_keep pet_expr *expr) {
- die();
-}
-__isl_give isl_multi_pw_aff *pet_expr_access_get_index(
- __isl_keep pet_expr *expr) {
- die();
-}
-__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr) {
- die();
-}
-__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
- struct ppcg_scop *ps, struct ppcg_options *options) {
- die();
-}
-
-__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt,
- __isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr) {
- die();
-}
-unsigned pet_loc_get_start(__isl_keep pet_loc *loc) {
- die();
-}
-unsigned pet_loc_get_end(__isl_keep pet_loc *loc) {
- die();
-}
-int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
- __isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
- __isl_take pet_scop *scop, void *user), void *user) {
- die();
-}
-__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
- __isl_take isl_printer *p) {
- die();
-}
-__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop) {
- die();
-}
-__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop) {
- die();
-}
-int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop) {
- die();
-}
-int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop) {
- die();
-}
-int pet_tree_foreach_expr(__isl_keep pet_tree *tree,
- int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
- die();
-}
-int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr,
- int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
- die();
-}
-int pet_stmt_is_kill(struct pet_stmt *stmt) {
- die();
-}
-struct isl_args pet_options_args;
-const char *ppcg_version(void) {
- die();
-}
-int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val) {
- die();
-}
-int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
- const char *input, const char *output) {
- die();
-}
-int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
- const char *input, const char *output) {
- die();
-}
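
Every stub in this file follows the same idiom: the symbol exists so that the ppcg code Polly links against resolves at link time even though the pet frontend and the OpenCL/CPU backends are not built, while any accidental call fails loudly at runtime. A minimal self-contained illustration of the pattern (stub name invented):

    #include <stdio.h>
    #include <stdlib.h>

    #define die() { \
        fprintf(stderr, "Dummy function %s called\n", __FUNCTION__); \
        abort(); \
    }

    /* Prints "Dummy function unused_backend called", then aborts. */
    int unused_backend(void) {
        die();
    }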
-__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt,
- __isl_keep isl_ast_build *build,
- __isl_give isl_multi_pw_aff *(*fn_index)(
- __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id,
- void *user), void *user_index,
- __isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr,
- __isl_keep isl_id *id, void *user), void *user_expr) {
- die();
-}
-__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
- __isl_keep pet_scop *scop) {
- die();
-}
-__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop) {
- die();
-}
-__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop) {
- die();
-}
-__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop) {
- die();
-}
-__isl_give isl_union_map *pet_scop_get_tagged_may_writes(
- __isl_keep pet_scop *scop) {
- die();
-}
-__isl_give isl_union_map *pet_scop_get_tagged_must_writes(
- __isl_keep pet_scop *scop) {
- die();
-}
-__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop) {
- die();
-}
-__isl_give isl_union_map *pet_scop_get_tagged_must_kills(
- __isl_keep pet_scop *scop) {
- die();
-}
-__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr) {
- die();
-}
-__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr,
- __isl_keep const char *name) {
- die();
-}
-__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos) {
- die();
-}
-__isl_give pet_expr *pet_expr_new_cast(const char *type_name,
- __isl_take pet_expr *arg) {
- die();
-}
-__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos,
- __isl_take pet_expr *arg) {
- die();
-}
-__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree) {
- die();
-}
-__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree) {
- die();
-}
-__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree,
- __isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user),
- void *user) {
- die();
-}
-__isl_give isl_union_map *pet_expr_access_get_may_read(
- __isl_keep pet_expr *expr) {
- die();
-}
-__isl_give isl_union_map *pet_expr_access_get_may_write(
- __isl_keep pet_expr *expr) {
- die();
-}
diff --git a/polly/lib/External/ppcg/gpu.h b/polly/lib/External/ppcg/gpu.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/gpu.h
+++ /dev/null
@@ -1,459 +0,0 @@
-#ifndef _GPU_H
-#define _GPU_H
-
-#include <isl/ast.h>
-#include <isl/id.h>
-#include <isl/id_to_ast_expr.h>
-
-#include <pet.h>
-
-#include "ppcg.h"
-#include "ppcg_options.h"
-
-/* An access to an outer array element or an iterator.
- * Accesses to iterators have an access relation that maps to an unnamed space.
- * An access may be both read and write.
- * If the access relation is empty, then the output dimension may
- * not be equal to the dimension of the corresponding array.
- */
-struct gpu_stmt_access {
- /* Access reads elements */
- int read;
- /* Access writes elements */
- int write;
- /* All writes are definite writes. */
- int exact_write;
- /* Is a single, fixed element being accessed? */
- isl_bool fixed_element;
- /* The number of index expressions specified in the access. */
- int n_index;
-
- /* May access relation */
- isl_map *access;
- /* May access relation with as domain a mapping from iteration domain
- * to a reference identifier.
- */
- isl_map *tagged_access;
- /* The reference id of the corresponding pet_expr. */
- isl_id *ref_id;
-
- struct gpu_stmt_access *next;
-};
-
-/* A representation of a user statement.
- * "stmt" points to the corresponding pet statement.
- * "id" is the identifier of the instance set of the statement.
- * "accesses" is a linked list of accesses performed by the statement. - * If the statement has been killed, i.e., if it will not be scheduled, - * then this linked list may be empty even if the actual statement does - * perform accesses. - */ -struct gpu_stmt { - isl_id *id; - struct pet_stmt *stmt; - - struct gpu_stmt_access *accesses; -}; - -/* Represents an outer array possibly accessed by a gpu_prog. - */ -struct gpu_array_info { - /* The array data space. */ - isl_space *space; - /* Element type. */ - char *type; - /* Element size. */ - int size; - /* Name of the array. */ - char *name; - /* Declared extent of original array. */ - isl_set *declared_extent; - /* AST expression for declared size of original array. */ - isl_ast_expr *declared_size; - /* Extent of the array that needs to be copied. */ - isl_set *extent; - /* Number of indices. */ - unsigned n_index; - /* For each index, a bound on "extent" in that direction. */ - isl_multi_pw_aff *bound; - /* The corresponding access AST expression, if the array needs - * to be allocated on the device. - */ - isl_ast_expr *bound_expr; - - /* All references to this array; point to elements of a linked list. */ - int n_ref; - struct gpu_stmt_access **refs; - - /* Is this array accessed at all by the program? */ - int accessed; - - /* Is this a scalar that is read-only within the entire program? */ - int read_only_scalar; - - /* Are the elements of the array structures? */ - int has_compound_element; - - /* Are the elements only accessed through constant index expressions? */ - int only_fixed_element; - - /* Is the array local to the scop? */ - int local; - /* Is the array local and should it be declared on the host? */ - int declare_local; - - /* Is the corresponding global device memory accessed in any way? */ - int global; - - /* Should the array be linearized? */ - int linearize; - - /* Order dependences on this array. - * Only used if live_range_reordering option is set. - * It is set to NULL otherwise. - */ - isl_union_map *dep_order; - - void *user; -}; - -/* Represents an outer array accessed by a ppcg_kernel, localized - * to the context of this kernel. - * - * "array" points to the corresponding array in the gpu_prog. - * The "n_group" "groups" are the reference groups associated to the array. - * If "force_private" is set, then the array (in practice a scalar) - * must be mapped to a register. - * "global" is set if the global device memory corresponding - * to this array is accessed by the kernel. - * "bound" is equal to array->bound specialized to the current kernel. - * "bound_expr" is the corresponding access AST expression. - */ -struct gpu_local_array_info { - struct gpu_array_info *array; - - int n_group; - struct gpu_array_ref_group **groups; - - int force_private; - int global; - - unsigned n_index; - isl_multi_pw_aff *bound; - isl_ast_expr *bound_expr; -}; - -__isl_give isl_ast_expr *gpu_local_array_info_linearize_index( - struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr); - -/* A sequence of "n" names of types. - */ -struct gpu_types { - int n; - char **name; -}; - -/* "read" and "write" contain the original access relations, possibly - * involving member accesses. - * - * The elements of "array", as well as the ranges of "copy_in" and "copy_out" - * only refer to the outer arrays of any possible member accesses. 
- */ -struct gpu_prog { - isl_ctx *ctx; - - struct ppcg_scop *scop; - - /* Set of parameter values */ - isl_set *context; - - /* All potential read accesses in the entire program */ - isl_union_map *read; - - /* All potential write accesses in the entire program */ - isl_union_map *may_write; - /* All definite write accesses in the entire program */ - isl_union_map *must_write; - /* All tagged definite kills in the entire program */ - isl_union_map *tagged_must_kill; - - /* The set of inner array elements that may be preserved. */ - isl_union_set *may_persist; - - /* A mapping from all innermost arrays to their outer arrays. */ - isl_union_map *to_outer; - /* A mapping from the outer arrays to all corresponding inner arrays. */ - isl_union_map *to_inner; - /* A mapping from all intermediate arrays to their outer arrays, - * including an identity mapping from the anonymous 1D space to itself. - */ - isl_union_map *any_to_outer; - - /* Order dependences on non-scalars. */ - isl_union_map *array_order; - - /* Array of statements */ - int n_stmts; - struct gpu_stmt *stmts; - - int n_array; - struct gpu_array_info *array; -}; - -struct gpu_gen { - isl_ctx *ctx; - struct ppcg_options *options; - - /* Callback for printing of AST in appropriate format. */ - __isl_give isl_printer *(*print)(__isl_take isl_printer *p, - struct gpu_prog *prog, __isl_keep isl_ast_node *tree, - struct gpu_types *types, void *user); - void *print_user; - - isl_id_to_ast_expr *(*build_ast_expr)(void *stmt, - isl_ast_build *build, - isl_multi_pw_aff *(*fn_index)( - __isl_take isl_multi_pw_aff *mpa, isl_id *id, - void *user), - void *user_index, - isl_ast_expr *(*fn_expr)(isl_ast_expr *expr, - isl_id *id, void *user), - void *user_expr); - - struct gpu_prog *prog; - /* The generated AST. */ - isl_ast_node *tree; - - /* The sequence of types for which a definition has been printed. */ - struct gpu_types types; - - /* User specified tile, grid and block sizes for each kernel */ - isl_union_map *sizes; - - /* Effectively used tile, grid and block sizes for each kernel */ - isl_union_map *used_sizes; - - /* Identifier of the next kernel. */ - int kernel_id; -}; - -enum ppcg_group_access_type { - ppcg_access_global, - ppcg_access_shared, - ppcg_access_private -}; - -enum ppcg_kernel_stmt_type { - ppcg_kernel_copy, - ppcg_kernel_domain, - ppcg_kernel_sync -}; - -/* Representation of special statements, in particular copy statements - * and __syncthreads statements, inside a kernel. - * - * type represents the kind of statement - * - * - * for ppcg_kernel_copy statements we have - * - * read is set if the statement should copy data from global memory - * to shared memory or registers. 
- * - * index expresses an access to the array element that needs to be copied - * local_index expresses the corresponding element in the tile - * - * array refers to the original array being copied - * local_array is a pointer to the appropriate element in the "array" - * array of the ppcg_kernel to which this copy access belongs - * - * - * for ppcg_kernel_domain statements we have - * - * stmt is the corresponding input statement - * - * n_access is the number of accesses in stmt - * access is an array of local information about the accesses - */ -struct ppcg_kernel_stmt { - enum ppcg_kernel_stmt_type type; - - union { - struct { - int read; - isl_ast_expr *index; - isl_ast_expr *local_index; - struct gpu_array_info *array; - struct gpu_local_array_info *local_array; - } c; - struct { - struct gpu_stmt *stmt; - isl_id_to_ast_expr *ref2expr; - } d; - } u; -}; - -/* Representation of a local variable in a kernel. - */ -struct ppcg_kernel_var { - struct gpu_array_info *array; - enum ppcg_group_access_type type; - char *name; - isl_vec *size; -}; - -/* Representation of a kernel. - * - * prog describes the original code from which the kernel is extracted. - * - * id is the sequence number of the kernel. - * - * block_ids contains the list of block identifiers for this kernel. - * thread_ids contains the list of thread identifiers for this kernel. - * - * the first n_grid elements of grid_dim represent the specified size - * of the grid. - * the first n_block elements of block_dim represent the specified or - * effective size of the block. - * Note that in the input file, the sizes of the grid and the blocks - * are specified in the order x, y, z, but internally, the sizes - * are stored in reverse order, so that the last element always - * refers to the x dimension. - * - * grid_size reflects the effective grid size. - * grid_size_expr contains a corresponding access AST expression, built within - * the context where the launch appears. - * - * context contains the values of the parameters and outer schedule dimensions - * for which any statement instance in this kernel needs to be executed. - * - * n_sync is the number of synchronization operations that have - * been introduced in the schedule tree corresponding to this kernel (so far). - * - * core contains the spaces of the statement domains that form - * the core computation of the kernel. It is used to navigate - * the tree during the construction of the device part of the schedule - * tree in gpu_create_kernel. - * - * expanded_domain contains the original statement instances, - * i.e., those that appear in the domains of access relations, - * that are involved in the kernel. - * contraction maps those original statement instances to - * the statement instances that are active at the point - * in the schedule tree where the kernel is created. - * - * arrays is the set of possibly accessed outer array elements. - * - * space is the schedule space of the AST context. That is, it represents - * the loops of the generated host code containing the kernel launch. - * - * n_array is the total number of arrays in the input program and also - * the number of element in the array array. - * array contains information about each array that is local - * to the current kernel. If an array is not used in a kernel, - * then the corresponding entry does not contain any information. 
- * - * any_force_private is set if any array in the kernel is marked force_private - * - * block_filter contains constraints on the domain elements in the kernel - * that encode the mapping to block identifiers, where the block identifiers - * are represented by "n_grid" parameters with as names the elements - * of "block_ids". - * - * thread_filter contains constraints on the domain elements in the kernel - * that encode the mapping to thread identifiers, where the thread identifiers - * are represented by "n_block" parameters with as names the elements - * of "thread_ids". - * - * copy_schedule corresponds to the schedule dimensions of - * the (tiled) schedule for this kernel that have been taken into account - * for computing private/shared memory tiles. - * The domain corresponds to the original statement instances, i.e., - * those that appear in the leaves of the schedule tree. - * copy_schedule_dim is the dimension of this schedule. - * - * sync_writes contains write references that require synchronization. - * Each reference is represented by a universe set in a space [S[i,j] -> R[]] - * with S[i,j] the statement instance space and R[] the array reference. - */ -struct ppcg_kernel { - isl_ctx *ctx; - struct ppcg_options *options; - - struct gpu_prog *prog; - - int id; - - isl_id_list *block_ids; - isl_id_list *thread_ids; - - int n_grid; - int n_block; - int grid_dim[2]; - int block_dim[3]; - - isl_multi_pw_aff *grid_size; - isl_ast_expr *grid_size_expr; - isl_set *context; - - int n_sync; - isl_union_set *core; - isl_union_set *arrays; - - isl_union_pw_multi_aff *contraction; - isl_union_set *expanded_domain; - - isl_space *space; - - int n_array; - struct gpu_local_array_info *array; - - int n_var; - struct ppcg_kernel_var *var; - - int any_force_private; - - isl_union_set *block_filter; - isl_union_set *thread_filter; - isl_union_pw_multi_aff *copy_schedule; - int copy_schedule_dim; - - isl_union_set *sync_writes; - - isl_ast_node *tree; -}; - -int gpu_array_is_scalar(struct gpu_array_info *array); -int gpu_array_is_read_only_scalar(struct gpu_array_info *array); -int gpu_array_requires_device_allocation(struct gpu_array_info *array); -__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array); -isl_bool gpu_array_can_be_private(struct gpu_array_info *array); - -struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop); -void *gpu_prog_free(struct gpu_prog *prog); - -int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i); - -int generate_gpu(isl_ctx *ctx, const char *input, FILE *out, - struct ppcg_options *options, - __isl_give isl_printer *(*print)(__isl_take isl_printer *p, - struct gpu_prog *prog, __isl_keep isl_ast_node *tree, - struct gpu_types *types, void *user), void *user); - -__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen, - __isl_take isl_schedule_node *node, int scale, - __isl_keep isl_multi_val *sizes); - -__isl_give isl_schedule *get_schedule(struct gpu_gen *gen); -int has_any_permutable_node(__isl_keep isl_schedule *schedule); -__isl_give isl_schedule *map_to_device(struct gpu_gen *gen, - __isl_take isl_schedule *schedule, - int to_from_device); -__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, - __isl_take isl_schedule *schedule); - -__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog); -void collect_references(struct gpu_prog *prog, struct gpu_array_info *array); -void collect_order_dependences(struct gpu_prog *prog); -isl_bool only_fixed_element_accessed(struct 
gpu_array_info *array);
-#endif
diff --git a/polly/lib/External/ppcg/gpu.c b/polly/lib/External/ppcg/gpu.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/gpu.c
+++ /dev/null
@@ -1,5849 +0,0 @@
-/*
- * Copyright 2010-2011 INRIA Saclay
- * Copyright 2012-2013 Ecole Normale Superieure
- * Copyright 2015-2016 Sven Verdoolaege
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
- * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
- * 91893 Orsay, France
- * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <isl/polynomial.h>
-#include <isl/union_set.h>
-#include <isl/aff.h>
-#include <isl/ilp.h>
-#include <isl/flow.h>
-#include <isl/schedule.h>
-#include <isl/schedule_node.h>
-#include <isl/options.h>
-#include <isl/ast_build.h>
-
-#include "cpu.h"
-#include "gpu.h"
-#include "gpu_array_tile.h"
-#include "gpu_group.h"
-#include "gpu_hybrid.h"
-#include "gpu_tree.h"
-#include "hybrid.h"
-#include "schedule.h"
-#include "ppcg_options.h"
-#include "print.h"
-#include "util.h"
-
-struct gpu_array_info;
-
-/* Return the name of the outer array (of structs) accessed by "access".
- */
-static const char *get_outer_array_name(__isl_keep isl_map *access)
-{
- isl_space *space;
- const char *name;
-
- space = isl_space_range(isl_map_get_space(access));
- while (space && isl_space_is_wrapping(space))
- space = isl_space_domain(isl_space_unwrap(space));
- name = isl_space_get_tuple_name(space, isl_dim_set);
- isl_space_free(space);
-
- return name;
-}
-
-/* Collect all references to the given array and store pointers to them
- * in array->refs.
- */
-void collect_references(struct gpu_prog *prog,
- struct gpu_array_info *array)
-{
- int i;
- int n;
-
- n = 0;
- for (i = 0; i < prog->n_stmts; ++i) {
- struct gpu_stmt *stmt = &prog->stmts[i];
- struct gpu_stmt_access *access;
-
- for (access = stmt->accesses; access; access = access->next) {
- const char *name;
- name = get_outer_array_name(access->access);
- if (name && !strcmp(array->name, name))
- n++;
- }
- }
-
- array->n_ref = n;
- array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
- assert(array->refs);
-
- n = 0;
- for (i = 0; i < prog->n_stmts; ++i) {
- struct gpu_stmt *stmt = &prog->stmts[i];
- struct gpu_stmt_access *access;
-
- for (access = stmt->accesses; access; access = access->next) {
- const char *name;
- name = get_outer_array_name(access->access);
- if (!name || strcmp(array->name, name))
- continue;
-
- array->refs[n++] = access;
- }
- }
-}
-
-/* Compute and return the extent of "array", taking into account the set of
- * accessed elements.
- *
- * In particular, the extent in the outer dimension is taken
- * from "accessed", while the extents in the remaining dimensions
- * are taken from array->extent.
- *
- * The extent in the outer dimension cannot be taken from array->extent
- * because that may be unbounded. Furthermore, even if it is bounded,
- * it may be larger than the piece of the array that is being accessed.
- */ -static __isl_give isl_set *compute_extent(struct pet_array *array, - __isl_keep isl_set *accessed) -{ - int n_index; - isl_id *id; - isl_set *outer; - isl_set *extent; - - extent = isl_set_copy(array->extent); - - n_index = isl_set_dim(accessed, isl_dim_set); - if (n_index == 0) - return extent; - - extent = isl_set_project_out(extent, isl_dim_set, 0, 1); - outer = isl_set_copy(accessed); - outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1); - extent = isl_set_flat_product(outer, extent); - id = isl_set_get_tuple_id(accessed); - extent = isl_set_set_tuple_id(extent, id); - - return extent; -} - -/* Is the array "array" being extracted a read-only scalar? - * - * That is, is "array" a scalar that is never possibly written to. - * An array containing structures is never considered to be a scalar. - */ -static int is_read_only_scalar(struct gpu_array_info *array, - struct gpu_prog *prog) -{ - isl_set *space; - isl_union_map *write; - int empty; - - if (array->has_compound_element) - return 0; - if (array->n_index != 0) - return 0; - - write = isl_union_map_copy(prog->may_write); - space = isl_set_universe(isl_space_copy(array->space)); - write = isl_union_map_intersect_range(write, - isl_union_set_from_set(space)); - empty = isl_union_map_is_empty(write); - isl_union_map_free(write); - - return empty; -} - -/* Is "array" only accessed as individual, fixed elements? - * That is, does each access to "array" access a single, fixed element? - */ -isl_bool only_fixed_element_accessed(struct gpu_array_info *array) -{ - int i; - - for (i = 0; i < array->n_ref; ++i) - if (!array->refs[i]->fixed_element) - return isl_bool_false; - - return isl_bool_true; -} - -/* Compute bounds on the host array "pa" based on the corresponding - * accessed elements in "arrays" - * and collect all references to the array. - * Store the results in "info". - * - * If the array is zero-dimensional and does not contain structures, - * i.e., if the array is a scalar, we check whether it is read-only. - * We also check whether the array is accessed at all. 
- */ -static int extract_array_info(struct gpu_prog *prog, - struct gpu_array_info *info, struct pet_array *pa, - __isl_keep isl_union_set *arrays) -{ - int empty; - const char *name; - int n_index; - isl_multi_pw_aff *bounds; - isl_set *accessed, *extent; - - n_index = isl_set_dim(pa->extent, isl_dim_set); - name = isl_set_get_tuple_name(pa->extent); - - info->space = isl_set_get_space(pa->extent); - info->name = strdup(name); - info->n_index = n_index; - info->linearize = prog->scop->options->linearize_device_arrays; - - info->type = strdup(pa->element_type); - info->size = pa->element_size; - info->local = pa->declared && !pa->exposed; - info->has_compound_element = pa->element_is_record; - info->read_only_scalar = is_read_only_scalar(info, prog); - - info->declared_extent = isl_set_copy(pa->extent); - accessed = isl_union_set_extract_set(arrays, - isl_space_copy(info->space)); - empty = isl_set_is_empty(accessed); - extent = compute_extent(pa, accessed); - isl_set_free(accessed); - info->extent = extent; - if (empty < 0) - return -1; - info->accessed = !empty; - bounds = ppcg_size_from_extent(isl_set_copy(extent)); - bounds = isl_multi_pw_aff_gist(bounds, isl_set_copy(prog->context)); - if (!bounds) - return -1; - if (!isl_multi_pw_aff_is_cst(bounds)) - info->linearize = 1; - info->bound = bounds; - - collect_references(prog, info); - info->only_fixed_element = only_fixed_element_accessed(info); - - return 0; -} - -/* Remove independence from the order constraints "order" on array "array". - * Since the pairs of iterations in the filter relation of an independence - * are guaranteed to be completely independent by the user, there is - * no need to ensure that live ranges are ordered along those pairs. - * We make an exception for local variables, though, as the independence - * guarantee does not apply to those. - * - * The order constraints are used in two places. - * Those on scalars are used in check_scalar_live_ranges to check if - * we need to force the scalar to be private. Any non-local scalar - * should not be forced scalar if it only appears in independent loops. - * Those on non-scalars are added to the coincidence constraints - * in compute_schedule because we do not support any array expansion. - * Accesses to non-local arrays should not prevent a loop from being - * considered coincident so we should indeed remove those constraints - * from the order constraints. - */ -static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog, - struct gpu_array_info *array, __isl_take isl_union_map *order) -{ - // We do not have independence information in Polly. Hence, make this - // function a no-op. - return order; - int i; - - for (i = 0; i < prog->scop->pet->n_independence; ++i) { - struct pet_independence *pi = prog->scop->pet->independences[i]; - if (isl_union_set_contains(pi->local, array->space)) - continue; - - order = isl_union_map_subtract(order, - isl_union_map_copy(pi->filter)); - } - - return order; -} - -/* For each array in "prog", store the (untagged) order dependences - * derived from the array in array->dep_order. - * In particular, consider all references that access the given array - * and take the order dependences that have one of these references - * as source. (Since an order dependence relates two references to - * the same array, the target of these order dependences will also - * be one of these references.) 
- * Additionally, store the union of these array->dep_order relations - * for all arrays that cannot be mapped to private memory in prog->array_order. - */ -void collect_order_dependences(struct gpu_prog *prog) -{ - int i; - isl_space *space; - isl_union_map *accesses; - - space = isl_union_map_get_space(prog->read); - prog->array_order = isl_union_map_empty(space); - - accesses = isl_union_map_copy(prog->scop->tagged_reads); - accesses = isl_union_map_union(accesses, - isl_union_map_copy(prog->scop->tagged_may_writes)); - accesses = isl_union_map_universe(accesses); - accesses = isl_union_map_apply_range(accesses, - isl_union_map_copy(prog->to_outer)); - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_set *set; - isl_union_set *uset; - isl_union_map *order; - - set = isl_set_universe(isl_space_copy(array->space)); - uset = isl_union_set_from_set(set); - uset = isl_union_map_domain( - isl_union_map_intersect_range(isl_union_map_copy(accesses), - uset)); - order = isl_union_map_copy(prog->scop->tagged_dep_order); - order = isl_union_map_intersect_domain(order, uset); - order = isl_union_map_zip(order); - order = isl_union_set_unwrap(isl_union_map_domain(order)); - order = remove_independences(prog, array, order); - array->dep_order = order; - - if (gpu_array_can_be_private(array)) - continue; - - prog->array_order = isl_union_map_union(prog->array_order, - isl_union_map_copy(array->dep_order)); - } - - isl_union_map_free(accesses); -} - -/* Construct a gpu_array_info for each array referenced by prog->scop and - * collect them in prog->array. - * - * The sizes are based on the extents and the set of possibly accessed - * elements by "prog". - * If there are any member accesses involved, then they are first mapped - * to the outer arrays of structs. - * Only extract gpu_array_info entries for these outer arrays. - * - * If we are allowing live range reordering, then also set - * the dep_order field. Otherwise leave it NULL. 
- */ -static int collect_array_info(struct gpu_prog *prog) -{ - int i; - int r = 0; - isl_union_set *arrays; - - arrays = isl_union_map_range(isl_union_map_copy(prog->read)); - arrays = isl_union_set_union(arrays, - isl_union_map_range(isl_union_map_copy(prog->may_write))); - - arrays = isl_union_set_apply(arrays, - isl_union_map_copy(prog->to_outer)); - - arrays = isl_union_set_coalesce(arrays); - - prog->n_array = prog->scop->pet->n_array; - prog->array = isl_calloc_array(prog->ctx, - struct gpu_array_info, prog->n_array); - assert(prog->array); - prog->n_array = 0; - for (i = 0; i < prog->scop->pet->n_array; ++i) { - isl_bool field; - - field = isl_set_is_wrapping(prog->scop->pet->arrays[i]->extent); - if (field < 0) - break; - if (field) - continue; - if (extract_array_info(prog, &prog->array[prog->n_array++], - prog->scop->pet->arrays[i], arrays) < 0) - r = -1; - } - if (i < prog->scop->pet->n_array) - r = -1; - - isl_union_set_free(arrays); - - if (prog->scop->options->live_range_reordering) - collect_order_dependences(prog); - - return r; -} - -static void free_array_info(struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - free(prog->array[i].type); - free(prog->array[i].name); - isl_multi_pw_aff_free(prog->array[i].bound); - isl_ast_expr_free(prog->array[i].bound_expr); - isl_space_free(prog->array[i].space); - isl_set_free(prog->array[i].declared_extent); - isl_set_free(prog->array[i].extent); - isl_ast_expr_free(prog->array[i].declared_size); - free(prog->array[i].refs); - isl_union_map_free(prog->array[i].dep_order); - } - free(prog->array); -} - -/* Check if a gpu array is a scalar. A scalar is a value that is not stored - * as an array or through a pointer reference, but as a single data element. - * At the moment, scalars are represented as zero-dimensional arrays. - * Note that the single data element may be an entire structure. - */ -int gpu_array_is_scalar(struct gpu_array_info *array) -{ - return array->n_index == 0; -} - -/* Can "array" be mapped to private memory? - * That is, is it only accessed as individual elements with - * constant index expressions? - */ -isl_bool gpu_array_can_be_private(struct gpu_array_info *array) -{ - if (!array) - return isl_bool_error; - return array->only_fixed_element; -} - -/* Is "array" a read-only scalar? - */ -int gpu_array_is_read_only_scalar(struct gpu_array_info *array) -{ - return array->read_only_scalar; -} - -/* Does "array" need to be allocated on the device? - * If it is a read-only scalar, then it will be passed as an argument - * to the kernel and therefore does not require any allocation. - * If this device memory is not accessed at all, then it does not - * need to be allocated either. - */ -int gpu_array_requires_device_allocation(struct gpu_array_info *array) -{ - if (gpu_array_is_read_only_scalar(array)) - return 0; - if (!array->global) - return 0; - return 1; -} - -/* Return the set of parameter values for which the array has a positive - * size in all dimensions. - * If the sizes are only valid for some parameter values, then those - * constraints are also taken into account. 
- */ -__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array) -{ - int i; - isl_space *space; - isl_set *guard; - - if (!array) - return NULL; - - space = isl_space_params(isl_space_copy(array->space)); - guard = isl_set_universe(space); - - for (i = 0; i < array->n_index; ++i) { - isl_pw_aff *bound; - isl_set *guard_i, *zero; - - bound = isl_multi_pw_aff_get_pw_aff(array->bound, i); - guard_i = isl_pw_aff_nonneg_set(isl_pw_aff_copy(bound)); - zero = isl_pw_aff_zero_set(bound); - guard_i = isl_set_subtract(guard_i, zero); - guard = isl_set_intersect(guard, guard_i); - } - - return guard; -} - -/* Internal data structure for extract_size_of_type. - * "type" specifies the name of the space that we want to extract. - * "res" is used to store the subset of that space. - */ -struct ppcg_extract_size_data { - const char *type; - isl_set *res; -}; - -/* This function is called for each set in a union_set. - * If the name of the set matches data->type, we store the - * set in data->res. - */ -static isl_stat extract_size_of_type(__isl_take isl_set *size, void *user) -{ - struct ppcg_extract_size_data *data = user; - const char *name; - - name = isl_set_get_tuple_name(size); - if (name && !strcmp(name, data->type)) { - data->res = size; - return isl_stat_error; - } - - isl_set_free(size); - return isl_stat_ok; -} - -/* Given a union map { kernel[i] -> *[...] }, - * return the range in the space called "type" for the kernel with - * sequence number "id". - */ -static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes, - const char *type, int id) -{ - isl_space *space; - isl_set *dom; - isl_union_set *local_sizes; - struct ppcg_extract_size_data data = { type, NULL }; - - if (!sizes) - return NULL; - - space = isl_union_map_get_space(sizes); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, 1); - space = isl_space_set_tuple_name(space, isl_dim_set, "kernel"); - dom = isl_set_universe(space); - dom = isl_set_fix_si(dom, isl_dim_set, 0, id); - - local_sizes = isl_union_set_apply(isl_union_set_from_set(dom), - isl_union_map_copy(sizes)); - isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data); - isl_union_set_free(local_sizes); - return data.res; -} - -/* Given a singleton set, extract the first (at most *len) elements - * of the single integer tuple into *sizes and update *len if needed. - */ -static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len) -{ - int i; - int dim; - - if (!set) - return; - - dim = isl_set_dim(set, isl_dim_set); - if (dim < *len) - *len = dim; - - for (i = 0; i < *len; ++i) { - isl_val *v; - - v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i); - assert(v); - - sizes[i] = isl_val_get_num_si(v); - isl_val_free(v); - } - - isl_set_free(set); -} - -/* Add the map { kernel[id] -> type[sizes] } to gen->used_sizes, - * if the option debug->dump_sizes is set. 
- */ -static void set_used_sizes(struct gpu_gen *gen, const char *type, int id, - int *sizes, int len) -{ - int i; - isl_space *space; - isl_map *map; - - if (!gen->options->debug->dump_sizes) - return; - - space = isl_union_map_get_space(gen->used_sizes); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, 1); - space = isl_space_set_tuple_name(space, isl_dim_set, "kernel"); - space = isl_space_from_domain(space); - space = isl_space_add_dims(space, isl_dim_out, len); - space = isl_space_set_tuple_name(space, isl_dim_out, type); - - map = isl_map_universe(space); - map = isl_map_fix_si(map, isl_dim_in, 0, id); - for (i = 0; i < len; ++i) - map = isl_map_fix_si(map, isl_dim_out, i, sizes[i]); - - gen->used_sizes = isl_union_map_add_map(gen->used_sizes, map); -} - -/* Extract user specified "tile" sizes from the "sizes" command line option, - * defaulting to option->tile_size in each dimension. - * *tile_len contains the maximum number of tile sizes needed. - * Update *tile_len to the number of specified tile sizes, if any, and - * return a pointer to the tile sizes (or NULL on error). - * Add the effectively used sizes to gen->used_sizes. - */ -static int *read_tile_sizes(struct gpu_gen *gen, int *tile_len) -{ - int n; - int *tile_size; - isl_set *size; - - tile_size = isl_alloc_array(gen->ctx, int, *tile_len); - if (!tile_size) - return NULL; - for (n = 0; n < *tile_len; ++n) - tile_size[n] = gen->options->tile_size; - - size = extract_sizes(gen->sizes, "tile", gen->kernel_id); - read_sizes_from_set(size, tile_size, tile_len); - set_used_sizes(gen, "tile", gen->kernel_id, tile_size, *tile_len); - - return tile_size; -} - -/* Extract user specified "block" sizes from the "sizes" command line option, - * after filling in some potentially useful defaults. - */ -static void read_block_sizes(struct ppcg_kernel *kernel, - __isl_keep isl_union_map *sizes) -{ - isl_set *size; - - if (kernel->n_block > 3) - kernel->n_block = 3; - switch (kernel->n_block) { - case 1: - kernel->block_dim[0] = 512; - break; - case 2: - kernel->block_dim[0] = 32; - kernel->block_dim[1] = 16; - break; - default: - kernel->block_dim[0] = 32; - kernel->block_dim[1] = 4; - kernel->block_dim[2] = 4; - break; - } - - size = extract_sizes(sizes, "block", kernel->id); - read_sizes_from_set(size, kernel->block_dim, &kernel->n_block); -} - -/* Extract user specified "grid" sizes from the "sizes" command line option, - * after filling in some potentially useful defaults. - */ -static void read_grid_sizes(struct ppcg_kernel *kernel, - __isl_keep isl_union_map *sizes) -{ - isl_set *size; - - if (kernel->n_grid > 2) - kernel->n_grid = 2; - switch (kernel->n_grid) { - case 1: - kernel->grid_dim[0] = 32768; - break; - default: - kernel->grid_dim[0] = 256; - kernel->grid_dim[1] = 256; - break; - } - - size = extract_sizes(sizes, "grid", kernel->id); - read_sizes_from_set(size, kernel->grid_dim, &kernel->n_grid); -} - -/* Extract user specified grid and block sizes from the gen->sizes - * command line option after filling in some potentially useful defaults. - * Store the extracted sizes in "kernel". - * Add the effectively used sizes to gen->used_sizes. 
- */ -static void read_grid_and_block_sizes(struct ppcg_kernel *kernel, - struct gpu_gen *gen) -{ - read_block_sizes(kernel, gen->sizes); - read_grid_sizes(kernel, gen->sizes); - set_used_sizes(gen, "block", kernel->id, - kernel->block_dim, kernel->n_block); - set_used_sizes(gen, "grid", kernel->id, - kernel->grid_dim, kernel->n_grid); -} - -static void *free_stmts(struct gpu_stmt *stmts, int n) -{ - int i; - - if (!stmts) - return NULL; - - for (i = 0; i < n; ++i) { - struct gpu_stmt_access *access, *next; - - for (access = stmts[i].accesses; access; access = next) { - next = access->next; - isl_id_free(access->ref_id); - isl_map_free(access->access); - isl_map_free(access->tagged_access); - free(access); - } - - isl_id_free(stmts[i].id); - } - free(stmts); - - return NULL; -} - -/* Add parameters p[i] with identifiers "ids" to "set", - * with bounds to 0 <= p[i] < size[i]. - */ -__isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set, - int *size, __isl_keep isl_id_list *ids) -{ - int i, len; - unsigned nparam; - - len = isl_id_list_n_id(ids); - nparam = isl_set_dim(set, isl_dim_param); - set = isl_set_add_dims(set, isl_dim_param, len); - - for (i = 0; i < len; ++i) { - isl_id *id; - - id = isl_id_list_get_id(ids, i); - set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id); - set = isl_set_lower_bound_si(set, isl_dim_param, nparam + i, 0); - set = isl_set_upper_bound_si(set, isl_dim_param, - nparam + i, size[i] - 1); - } - - return set; -} - -/* Add "len" parameters p[i] with identifiers "ids" and intersect "set" - * with - * - * { : 0 <= p[i] < size[i] } - * - * or an overapproximation. - */ -static __isl_give isl_set *add_bounded_parameters_dynamic( - __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size, - __isl_keep isl_id_list *ids) -{ - int i, len; - unsigned nparam; - isl_space *space; - isl_local_space *ls; - - len = isl_multi_pw_aff_dim(size, isl_dim_out); - nparam = isl_set_dim(set, isl_dim_param); - set = isl_set_add_dims(set, isl_dim_param, len); - - for (i = 0; i < len; ++i) { - isl_id *id; - - id = isl_id_list_get_id(ids, i); - set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id); - } - - space = isl_space_params(isl_set_get_space(set)); - ls = isl_local_space_from_space(space); - for (i = 0; i < len; ++i) { - isl_pw_aff *param, *size_i, *zero; - isl_set *bound; - - param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls), - isl_dim_param, nparam + i); - - size_i = isl_multi_pw_aff_get_pw_aff(size, i); - bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i); - bound = isl_set_from_basic_set(isl_set_simple_hull(bound)); - set = isl_set_intersect_params(set, bound); - - zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls)); - bound = isl_pw_aff_ge_set(param, zero); - set = isl_set_intersect_params(set, bound); - } - isl_local_space_free(ls); - - return set; -} - -/* Return the union of all tagged access relations in the group. - */ -static __isl_give isl_union_map *group_tagged_access_relation( - struct gpu_array_ref_group *group) -{ - int i; - isl_union_map *access; - - access = isl_union_map_empty(isl_map_get_space(group->access)); - for (i = 0; i < group->n_ref; ++i) { - isl_map *map_i; - - map_i = isl_map_copy(group->refs[i]->tagged_access); - access = isl_union_map_union(access, - isl_union_map_from_map(map_i)); - } - - return access; -} - -/* Return the extent of "array", recomputed from the bounds. - * The recomputed extent may be simpler than the original extent. 
- */
-static __isl_give isl_set *array_extent(struct gpu_array_info *array)
-{
-	int i;
-	isl_id *id;
-	isl_space *space;
-	isl_local_space *ls;
-	isl_set *extent;
-
-	id = isl_set_get_tuple_id(array->extent);
-	space = isl_set_get_space(array->extent);
-	extent = isl_set_universe(isl_space_copy(space));
-	ls = isl_local_space_from_space(space);
-	for (i = 0; i < array->n_index; ++i) {
-		isl_pw_aff *bound;
-		isl_aff *aff;
-		isl_pw_aff *index;
-		isl_set *lt;
-
-		extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);
-
-		aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
-					isl_dim_set, i);
-		index = isl_pw_aff_from_aff(aff);
-		bound = isl_multi_pw_aff_get_pw_aff(array->bound, i);
-		bound = isl_pw_aff_from_range(bound);
-		bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
-		bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
-						isl_id_copy(id));
-		lt = isl_pw_aff_lt_set(index, bound);
-		extent = isl_set_intersect(extent, lt);
-	}
-	isl_local_space_free(ls);
-	isl_id_free(id);
-
-	return extent;
-}
-
-/* Return a map from the first group->shared_tile->depth dimensions
- * of the computed schedule to the array tile in
- * global memory that corresponds to the shared memory copy.
- *
- * In particular, return a map
- *
- *	{ D[i] -> A[a] }
- *
- * with constraints
- *
- *	tile_offset(i) <= a <= tile_offset(i) + tile_size - 1		(1)
- *
- * and
- *
- *	0 <= a <= array_size - 1					(2)
- *
- * Note that if some stride has been detected (i.e., when
- * group->shared_tile->bound[i].shift is set), then a in (1) refers
- * to the shifted and scaled down version.
- *
- * Constraints (1) are obtained by mapping the size constraints on the
- * shared/private memory tile back to the access relation.
- * Constraints (2) are obtained from the (recomputed) extent.
- */
-static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
-{
-	int i;
-	int n_index = group->array->n_index;
-	isl_map *tile;
-	isl_space *space;
-	isl_set *local;
-	isl_set *extent;
-
-	space = isl_multi_aff_get_space(group->shared_tile->tiling);
-	space = isl_space_range(space);
-	local = isl_set_universe(space);
-	for (i = 0; i < n_index; ++i) {
-		isl_val *bound;
-
-		local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
-		bound = isl_val_copy(group->shared_tile->bound[i].size);
-		bound = isl_val_sub_ui(bound, 1);
-		local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
-	}
-	local = isl_set_preimage_multi_aff(local,
-			isl_multi_aff_copy(group->shared_tile->tiling));
-	tile = isl_set_unwrap(local);
-	extent = array_extent(group->array);
-	tile = isl_map_intersect_range(tile, extent);
-
-	return tile;
-}
-
-/* Given a mapping "iterator_map" from the AST schedule to a domain,
- * return the corresponding mapping from the AST schedule
- * to the outer kernel->copy_schedule_dim dimensions of
- * the schedule computed by PPCG for this kernel.
- *
- * Note that kernel->copy_schedule_dim is at least as large as
- * the largest depth of any array reference group associated to the kernel.
- * This is needed as the returned schedule is used to extract a mapping
- * to the outer tile->depth dimensions in transform_index.
- */ -static __isl_give isl_pw_multi_aff *compute_sched_to_copy( - struct ppcg_kernel *kernel, __isl_take isl_pw_multi_aff *iterator_map) -{ - isl_union_pw_multi_aff *upma; - isl_pw_multi_aff *pma; - isl_space *space; - - space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map)); - space = isl_space_from_domain(space); - space = isl_space_add_dims(space, isl_dim_out, - kernel->copy_schedule_dim); - - upma = isl_union_pw_multi_aff_copy(kernel->copy_schedule); - pma = isl_union_pw_multi_aff_extract_pw_multi_aff(upma, space); - isl_union_pw_multi_aff_free(upma); - - return isl_pw_multi_aff_pullback_pw_multi_aff(pma, iterator_map); -} - -/* If max_shared_memory is not set to infinity (-1), then make - * sure that the total amount of shared memory required by the - * array reference groups mapped to shared memory by "kernel" - * is no larger than this maximum. - * - * We apply a greedy approach and discard (keep in global memory) - * those groups that would result in a total memory size that - * is larger than the maximum. - * - * This function should be called after any function that may - * affect the decision on whether to place a reference group - * in private, shared or global memory. - */ -static void check_shared_memory_bound(struct ppcg_kernel *kernel) -{ - int i, j; - isl_val *left, *size; - - if (kernel->options->max_shared_memory < 0) - return; - - left = isl_val_int_from_si(kernel->ctx, - kernel->options->max_shared_memory); - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *local = &kernel->array[i]; - - for (j = 0; j < local->n_group; ++j) { - struct gpu_array_ref_group *group; - enum ppcg_group_access_type type; - - group = local->groups[j]; - type = gpu_array_ref_group_type(group); - if (type != ppcg_access_shared) - continue; - - size = gpu_array_tile_size(group->shared_tile); - size = isl_val_mul_ui(size, local->array->size); - - if (isl_val_le(size, left)) { - left = isl_val_sub(left, size); - continue; - } - isl_val_free(size); - - group->shared_tile = - gpu_array_tile_free(group->shared_tile); - } - } - - isl_val_free(left); -} - -/* Mark all arrays of "kernel" that have an array reference group - * that is not mapped to private or shared memory as - * accessing the corresponding global device memory. - */ -static void mark_global_arrays(struct ppcg_kernel *kernel) -{ - int i, j; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *local = &kernel->array[i]; - - if (local->global) - continue; - for (j = 0; j < local->n_group; ++j) { - if (gpu_array_ref_group_tile(local->groups[j])) - continue; - - local->global = 1; - local->array->global = 1; - break; - } - } -} - -/* Compute a tiling for all the array reference groups in "kernel". - */ -static void compute_group_tilings(struct ppcg_kernel *kernel) -{ - int i, j; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) - gpu_array_ref_group_compute_tiling(array->groups[j]); - } -} - -/* Compute the effective grid size as a list of the sizes in each dimension. - * - * The grid size specified by the user or set by default - * in read_grid_sizes() and applied by the block filter, - * may be too large for the given code in the sense that - * it may contain blocks that don't need to execute anything. - * We therefore don't return this grid size, but instead the - * smallest grid size that ensures that all blocks that actually - * execute code are included in the grid. 
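The greedy accounting in check_shared_memory_bound() boils down to isl_val arithmetic on a byte budget. A self-contained sketch (the 48KB budget and the per-group tile byte counts are illustrative):

#include <stdio.h>
#include <isl/ctx.h>
#include <isl/val.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	/* 48KB of shared memory, as on many NVIDIA devices. */
	isl_val *left = isl_val_int_from_si(ctx, 48 * 1024);
	long tile_bytes[3] = { 32 * 32 * 4, 34 * 34 * 4, 128 * 128 * 4 };
	int i;

	for (i = 0; i < 3; ++i) {
		isl_val *size = isl_val_int_from_si(ctx, tile_bytes[i]);

		if (isl_val_le(size, left)) {
			/* Fits: subtract from the remaining budget. */
			left = isl_val_sub(left, size);
			printf("group %d: shared\n", i);
		} else {
			/* Does not fit: keep the group in global memory. */
			isl_val_free(size);
			printf("group %d: global\n", i);
		}
	}

	isl_val_free(left);
	isl_ctx_free(ctx);
	return 0;
}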
- * - * We first extract a description of the grid, i.e., the possible values - * of the block ids, from the domain elements in "domain" and - * kernel->block_filter. - * The block ids are parameters in kernel->block_filter. - * We simply need to change them into set dimensions. - * - * Then, for each block dimension, we compute the maximal value of the block id - * and add one. - */ -static __isl_give isl_multi_pw_aff *extract_grid_size( - struct ppcg_kernel *kernel, __isl_take isl_union_set *domain) -{ - int i; - isl_set *grid; - isl_set *context; - isl_multi_pw_aff *size; - - domain = isl_union_set_intersect(domain, - isl_union_set_copy(kernel->block_filter)); - grid = isl_union_set_params(domain); - grid = isl_set_from_params(grid); - grid = isl_set_add_dims(grid, isl_dim_set, kernel->n_grid); - for (i = 0; i < kernel->n_grid; ++i) { - int pos; - isl_id *id; - - id = isl_id_list_get_id(kernel->block_ids, i); - pos = isl_set_find_dim_by_id(grid, isl_dim_param, id); - isl_id_free(id); - assert(pos >= 0); - grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i); - grid = isl_set_project_out(grid, isl_dim_param, pos, 1); - } - - grid = isl_set_coalesce(grid); - size = ppcg_size_from_extent(grid); - context = isl_set_params(isl_set_copy(kernel->context)); - return isl_multi_pw_aff_gist(size, context); -} - -/* Compute the size of a fixed bounding box around the origin and "set", - * where "set" is assumed to contain only non-negative elements, - * and store the results in "size". - * In particular, compute the maximal value of "set" in each direction - * and add one. - */ -static void extract_fixed_size(__isl_take isl_set *set, int *size) -{ - int i, n; - isl_local_space *ls; - isl_aff *obj; - - n = isl_set_dim(set, isl_dim_set); - ls = isl_local_space_from_space(isl_set_get_space(set)); - obj = isl_aff_zero_on_domain(ls); - for (i = 0; i < n; ++i) { - isl_val *max; - - obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1); - max = isl_set_max_val(set, obj); - size[i] = isl_val_get_num_si(max) + 1; - isl_val_free(max); - obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0); - } - isl_aff_free(obj); - isl_set_free(set); -} - -/* Compute the effective block size as a list of the sizes in each dimension - * and store the sizes in kernel->block_dim. - * - * The block size specified by the user or set by default - * in read_block_sizes() and applied by the thread filter, - * may be too large for the given code in the sense that - * it may contain threads that don't need to execute anything. - * We therefore update this block size in kernel->block_dim - * to the smallest block size that ensures that all threads - * that actually execute code are included in the block. - * - * The set of possible values of the thread ids is obtained from - * the domain elements "domain" and kernel->thread_filter. - * The current implementation eliminates all parameters, ensuring - * that the size is a fixed constant in each dimension. - * In principle we could also compute parametric sizes. - * We would have to make sure to project out all b%d and t%d parameters, - * however. 
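The core of extract_fixed_size() is maximizing each coordinate and adding one. A standalone sketch for one dimension (the set is illustrative):

#include <stdio.h>
#include <isl/ctx.h>
#include <isl/set.h>
#include <isl/aff.h>
#include <isl/val.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_set *set = isl_set_read_from_str(ctx, "{ [i] : 0 <= i <= 30 }");
	isl_local_space *ls =
		isl_local_space_from_space(isl_set_get_space(set));
	isl_aff *obj = isl_aff_zero_on_domain(ls);
	isl_val *max;

	/* Maximize the objective function [i] -> i over the set. */
	obj = isl_aff_set_coefficient_si(obj, isl_dim_in, 0, 1);
	max = isl_set_max_val(set, obj);
	/* Prints 31: the maximal value 30 plus one. */
	printf("size[0] = %ld\n", isl_val_get_num_si(max) + 1);

	isl_val_free(max);
	isl_aff_free(obj);
	isl_set_free(set);
	isl_ctx_free(ctx);
	return 0;
}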
- */
-static isl_stat extract_block_size(struct ppcg_kernel *kernel,
-	__isl_take isl_union_set *domain)
-{
-	int i;
-	int nparam;
-	isl_set *block;
-
-	domain = isl_union_set_intersect(domain,
-				isl_union_set_copy(kernel->thread_filter));
-	block = isl_union_set_params(domain);
-	block = isl_set_from_params(block);
-	block = isl_set_add_dims(block, isl_dim_set, kernel->n_block);
-	for (i = 0; i < kernel->n_block; ++i) {
-		int pos;
-		isl_id *id;
-
-		if (!block)
-			return isl_stat_error;
-
-		id = isl_id_list_get_id(kernel->thread_ids, i);
-		pos = isl_set_find_dim_by_id(block, isl_dim_param, id);
-		isl_id_free(id);
-		if (pos < 0)
-			isl_die(isl_set_get_ctx(block), isl_error_internal,
-				"missing constraints on thread identifier",
-				block = isl_set_free(block));
-		block = isl_set_equate(block, isl_dim_param, pos,
-					isl_dim_set, i);
-	}
-	nparam = isl_set_dim(block, isl_dim_param);
-	block = isl_set_project_out(block, isl_dim_param, 0, nparam);
-
-	if (!block)
-		return isl_stat_error;
-
-	extract_fixed_size(block, kernel->block_dim);
-
-	return isl_stat_ok;
-}
-
-struct ppcg_kernel *ppcg_kernel_free(struct ppcg_kernel *kernel)
-{
-	int i, j;
-
-	if (!kernel)
-		return NULL;
-
-	isl_id_list_free(kernel->block_ids);
-	isl_id_list_free(kernel->thread_ids);
-	isl_multi_pw_aff_free(kernel->grid_size);
-	isl_ast_expr_free(kernel->grid_size_expr);
-	isl_set_free(kernel->context);
-	isl_union_set_free(kernel->core);
-	isl_union_set_free(kernel->arrays);
-	isl_union_pw_multi_aff_free(kernel->contraction);
-	isl_union_set_free(kernel->expanded_domain);
-	isl_space_free(kernel->space);
-	isl_ast_node_free(kernel->tree);
-	isl_union_set_free(kernel->block_filter);
-	isl_union_set_free(kernel->thread_filter);
-	isl_union_pw_multi_aff_free(kernel->copy_schedule);
-	isl_union_set_free(kernel->sync_writes);
-
-	for (i = 0; i < kernel->n_array; ++i) {
-		struct gpu_local_array_info *array = &kernel->array[i];
-
-		for (j = 0; j < array->n_group; ++j)
-			gpu_array_ref_group_free(array->groups[j]);
-		free(array->groups);
-
-		isl_multi_pw_aff_free(array->bound);
-		isl_ast_expr_free(array->bound_expr);
-	}
-	free(kernel->array);
-
-	for (i = 0; i < kernel->n_var; ++i) {
-		free(kernel->var[i].name);
-		isl_vec_free(kernel->var[i].size);
-	}
-	free(kernel->var);
-
-	free(kernel);
-
-	return NULL;
-}
-
-/* Wrapper around ppcg_kernel_free for use as an isl_id_set_free_user callback.
- */ -static void ppcg_kernel_free_wrap(void *user) -{ - struct ppcg_kernel *kernel = user; - - ppcg_kernel_free(kernel); -} - -static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group, - struct ppcg_kernel_var *var) -{ - int j; - struct gpu_array_tile *tile; - isl_printer *p; - - var->array = group->array; - - var->type = gpu_array_ref_group_type(group); - tile = gpu_array_ref_group_tile(group); - - p = isl_printer_to_str(ctx); - p = gpu_array_ref_group_print_name(group, p); - var->name = isl_printer_get_str(p); - isl_printer_free(p); - - var->size = isl_vec_alloc(ctx, group->array->n_index); - - for (j = 0; j < group->array->n_index; ++j) - var->size = isl_vec_set_element_val(var->size, j, - isl_val_copy(tile->bound[j].size)); -} - -static int create_kernel_vars(struct ppcg_kernel *kernel) -{ - int i, j, n; - - n = 0; - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - enum ppcg_group_access_type type; - - type = gpu_array_ref_group_type(group); - if (type != ppcg_access_global) - ++n; - } - } - - kernel->n_var = n; - kernel->var = isl_calloc_array(kernel->ctx, struct ppcg_kernel_var, n); - if (!kernel->var) - return -1; - - n = 0; - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - enum ppcg_group_access_type type; - - type = gpu_array_ref_group_type(group); - if (type == ppcg_access_global) - continue; - create_kernel_var(kernel->ctx, group, &kernel->var[n]); - ++n; - } - } - - return 0; -} - -/* Replace "pa" by the zero function defined over the universe domain - * in the space of "pa". - */ -static __isl_give isl_pw_aff *set_universally_zero(__isl_take isl_pw_aff *pa) -{ - isl_space *space; - isl_aff *zero; - - space = isl_space_domain(isl_pw_aff_get_space(pa)); - isl_pw_aff_free(pa); - zero = isl_aff_zero_on_domain(isl_local_space_from_space(space)); - - return isl_pw_aff_from_aff(zero); -} - -/* The sizes of the arrays on the host that have been computed by - * extract_array_info may depend on the parameters. Use the extra - * constraints on the parameters that are valid at "host_domain" - * to simplify these expressions and store the results in kernel->array. - * - * We only need these localized bounds for arrays that are accessed - * by the current kernel. If we have found at least one reference group - * then the array is accessed by the kernel. - * - * The resulting sizes may be functions that are nowhere defined - * in case the access function cannot possibly access anything inside - * the kernel for some reason. If so, they are replaced by the zero - * function. Since the access function cannot actually access anything, - * there is no harm in printing the array sizes as zero. 
- */
-static void localize_bounds(struct ppcg_kernel *kernel,
-	__isl_keep isl_set *host_domain)
-{
-	int i, j;
-	isl_set *context;
-
-	context = isl_set_copy(host_domain);
-	context = isl_set_params(context);
-
-	for (i = 0; i < kernel->n_array; ++i) {
-		struct gpu_local_array_info *local = &kernel->array[i];
-		isl_multi_pw_aff *bound;
-		int n_index;
-
-		if (local->n_group == 0)
-			continue;
-
-		n_index = local->array->n_index;
-		bound = isl_multi_pw_aff_copy(local->array->bound);
-
-		for (j = 0; j < n_index; ++j) {
-			isl_pw_aff *pwaff;
-			int empty;
-
-			pwaff = isl_multi_pw_aff_get_pw_aff(bound, j);
-			pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
-			empty = isl_pw_aff_is_empty(pwaff);
-			if (empty < 0)
-				pwaff = isl_pw_aff_free(pwaff);
-			else if (empty)
-				pwaff = set_universally_zero(pwaff);
-			bound = isl_multi_pw_aff_set_pw_aff(bound, j, pwaff);
-		}
-
-		local->n_index = n_index;
-		local->bound = bound;
-	}
-	isl_set_free(context);
-}
-
-/* Create the array of gpu_local_array_info structures "array"
- * inside "kernel". The number of elements in this array is
- * the same as the number of arrays in "prog".
- * Initialize the "array" field of each local array to point
- * to the corresponding array in "prog".
- */
-static struct ppcg_kernel *ppcg_kernel_create_local_arrays(
-	struct ppcg_kernel *kernel, struct gpu_prog *prog)
-{
-	int i;
-	isl_ctx *ctx;
-
-	ctx = isl_set_get_ctx(prog->context);
-	kernel->array = isl_calloc_array(ctx,
-			struct gpu_local_array_info, prog->n_array);
-	if (!kernel->array)
-		return ppcg_kernel_free(kernel);
-	kernel->n_array = prog->n_array;
-
-	for (i = 0; i < prog->n_array; ++i)
-		kernel->array[i].array = &prog->array[i];
-
-	return kernel;
-}
-
-/* Does "kernel" need to be passed an argument corresponding to array "i"?
- *
- * The argument is only needed if the kernel accesses this device memory.
- */
-int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i)
-{
-	return kernel->array[i].global;
-}
-
-/* Find the element in prog->stmts that has the given "id".
- * Return NULL if no such gpu_stmt can be found.
- */
-static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
-{
-	int i;
-
-	for (i = 0; i < prog->n_stmts; ++i) {
-		if (id == prog->stmts[i].id)
-			break;
-	}
-
-	return i < prog->n_stmts ? &prog->stmts[i] : NULL;
-}
-
-void ppcg_kernel_stmt_free(void *user)
-{
-	struct ppcg_kernel_stmt *stmt = user;
-
-	if (!stmt)
-		return;
-
-	switch (stmt->type) {
-	case ppcg_kernel_copy:
-		isl_ast_expr_free(stmt->u.c.index);
-		isl_ast_expr_free(stmt->u.c.local_index);
-		break;
-	case ppcg_kernel_domain:
-		isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
-		break;
-	case ppcg_kernel_sync:
-		break;
-	}
-
-	free(stmt);
-}
-
-/* Return the gpu_stmt_access in the list "accesses" that corresponds
- * to "ref_id".
- */
-static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
-	__isl_keep isl_id *ref_id)
-{
-	struct gpu_stmt_access *access;
-
-	for (access = accesses; access; access = access->next)
-		if (access->ref_id == ref_id)
-			return access;
-
-	return NULL;
-}
-
-/* Return the index of the array called "name" in the list of arrays.
- */
-static int find_array_index(struct ppcg_kernel *kernel, const char *name)
-{
-	int i;
-
-	for (i = 0; i < kernel->n_array; ++i)
-		if (!strcmp(name, kernel->array[i].array->name))
-			return i;
-
-	return -1;
-}
-
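The simplification that localize_bounds() relies on is isl's gist operation under a parameter context. A minimal sketch (the bound expression and the parameter context are illustrative):

#include <isl/ctx.h>
#include <isl/set.h>
#include <isl/aff.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	/* A piecewise bound that caps n at 100. */
	isl_pw_aff *bound = isl_pw_aff_read_from_str(ctx,
		"[n] -> { [(n)] : n <= 100; [(100)] : n > 100 }");
	/* Extra knowledge about the parameters at the kernel launch. */
	isl_set *context = isl_set_read_from_str(ctx,
		"[n] -> { : 1 <= n <= 64 }");

	/* Under the context, only the first piece survives:
	 * [n] -> { [(n)] } */
	bound = isl_pw_aff_gist_params(bound, context);
	isl_pw_aff_dump(bound);

	isl_pw_aff_free(bound);
	isl_ctx_free(ctx);
	return 0;
}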
-/* Internal data structure for the index and AST expression transformation
- * callbacks for pet_stmt_build_ast_exprs.
- *
- * "kernel" is the kernel for which we are computing AST expressions and
- * may be NULL if we are not inside a kernel.
- * "accesses" is the list of gpu_stmt_access in the statement.
- * "iterator_map" expresses the statement iterators in terms of
- * the AST loop iterators.
- * "sched2copy" expresses the outer copy_schedule_dim dimensions of
- * the kernel schedule in terms of the AST loop iterators and
- * may be NULL if we are not inside a kernel.
- *
- * The following fields are set in transform_index and used in transform_expr.
- * "array" is the array that is being accessed.
- * "global" is set if the global array is accessed (rather than
- * shared/private memory).
- * "local_array" refers to information on the array specialized
- * to the current kernel.
- */
-struct ppcg_transform_data {
-	struct ppcg_options *options;
-	struct ppcg_kernel *kernel;
-	struct gpu_stmt_access *accesses;
-	isl_pw_multi_aff *iterator_map;
-	isl_pw_multi_aff *sched2copy;
-
-	struct gpu_array_info *array;
-	int global;
-	struct gpu_local_array_info *local_array;
-};
-
-/* Return a pointer to the gpu_array_ref_group in "local"
- * that contains the reference "access".
- * Return NULL if no such group can be found.
- */
-static struct gpu_array_ref_group *find_ref_group(
-	struct gpu_local_array_info *local, struct gpu_stmt_access *access)
-{
-	int i, j;
-
-	for (i = 0; i < local->n_group; ++i) {
-		struct gpu_array_ref_group *group = local->groups[i];
-
-		for (j = 0; j < group->n_ref; ++j)
-			if (group->refs[j] == access)
-				return group;
-	}
-
-	return NULL;
-}
-
-/* Given an index expression "index" of the form
- *
- *	L -> F(A),
- *
- * with F(A) either A or some subfield of A and L the AST loop iterators,
- * and a tiling "tiling" of the form
- *
- *	[L -> A] -> T
- *
- * apply the tiling to the outer array in the index expression to obtain
- *
- *	L -> T(A)
- *
- * If F(A) is some subfield of A, then separate the member access
- * into the base index expression and the field index expression,
- * apply the tiling to the base index expression and combine the result
- * with the field index expression.
- *
- * If F(A) is A, then modify index to keep track of the iterators
- *
- *	L -> [L -> A]
- *
- * and combine the result with the tiling to obtain a tiled index expression
- * in terms of the AST loop iterators
- *
- *	L -> T
- */
-static __isl_give isl_multi_pw_aff *tile_outer(
-	__isl_take isl_multi_pw_aff *index, __isl_take isl_multi_pw_aff *tiling)
-{
-	isl_bool is_wrapping;
-	isl_space *space;
-	isl_multi_pw_aff *mpa;
-
-	is_wrapping = isl_multi_pw_aff_range_is_wrapping(index);
-	if (is_wrapping < 0)
-		goto error;
-	if (is_wrapping) {
-		isl_multi_pw_aff *field;
-
-		field = isl_multi_pw_aff_copy(index);
-		field = isl_multi_pw_aff_range_factor_range(field);
-		index = isl_multi_pw_aff_range_factor_domain(index);
-		index = tile_outer(index, tiling);
-		return isl_multi_pw_aff_range_product(index, field);
-	}
-
-	space = isl_space_domain(isl_multi_pw_aff_get_space(index));
-	space = isl_space_map_from_set(space);
-	mpa = isl_multi_pw_aff_identity(space);
-	index = isl_multi_pw_aff_range_product(mpa, index);
-	index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
-
-	return index;
-error:
-	isl_multi_pw_aff_free(index);
-	isl_multi_pw_aff_free(tiling);
-	return NULL;
-}
-
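The non-wrapping branch of tile_outer() can be exercised in isolation. A sketch with illustrative one-dimensional spaces, where a mod-32 tiling stands in for a shared-memory tile:

#include <isl/ctx.h>
#include <isl/space.h>
#include <isl/aff.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_multi_pw_aff *index = isl_multi_pw_aff_read_from_str(ctx,
		"{ L[c0] -> A[c0 + 1] }");
	isl_multi_pw_aff *tiling = isl_multi_pw_aff_read_from_str(ctx,
		"{ [L[c0] -> A[a]] -> T[a mod 32] }");
	isl_space *space;
	isl_multi_pw_aff *id;

	/* Turn L -> A into L -> [L -> A], then pull the tiling back
	 * through it, as the non-wrapping case of tile_outer() does. */
	space = isl_space_domain(isl_multi_pw_aff_get_space(index));
	space = isl_space_map_from_set(space);
	id = isl_multi_pw_aff_identity(space);
	index = isl_multi_pw_aff_range_product(id, index);
	index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);

	/* { L[c0] -> T[(c0 + 1) mod 32] } */
	isl_multi_pw_aff_dump(index);

	isl_multi_pw_aff_free(index);
	isl_ctx_free(ctx);
	return 0;
}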
-/* Index transformation callback for pet_stmt_build_ast_exprs.
- *
- * "index" expresses the array indices in terms of statement iterators
- *
- * We first reformulate "index" in terms of the AST loop iterators.
- * Then we check if we are accessing the global array or
- * a shared/private copy. In particular, if we are not inside a kernel
- * then we must be accessing a global array.
- * In the former case, we simply return
- * the updated index. If "index" is an affine expression rather
- * than an array access, then we also return the updated index here.
- *
- * If no reference groups have been computed for the array,
- * then we can only be accessing the global array.
- *
- * Otherwise, we apply the tiling to the index.
- * This tiling is of the form
- *
- *	[D -> A] -> T
- *
- * where D corresponds to the outer tile->depth dimensions of
- * the kernel schedule.
- * The index is of the form
- *
- *	L -> A
- *
- * We update the tiling to refer to the AST loop iterators
- *
- *	[L -> A] -> T
- *
- * and combine it with the index to obtain a tiled index expression in terms
- * of the AST loop iterators
- *
- *	L -> T
- *
- * Note that while the tiling applies directly to an outer array,
- * the index may refer to some subfield of this outer array.
- * In such cases, the result will refer to the same subfield of the tile.
- * That is, an index expression of the form L -> F(A) will be transformed
- * into an index expression of the form L -> F(T).
- */
-static __isl_give isl_multi_pw_aff *transform_index(
-	__isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
-	void *user)
-{
-	struct ppcg_transform_data *data = user;
-	struct gpu_stmt_access *access;
-	struct gpu_array_ref_group *group;
-	struct gpu_array_tile *tile;
-	isl_pw_multi_aff *iterator_map;
-	int i;
-	int dim;
-	const char *name;
-	isl_space *space;
-	isl_multi_pw_aff *tiling;
-	isl_pw_multi_aff *pma;
-	isl_pw_multi_aff *sched2depth;
-
-	data->array = NULL;
-
-	iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
-	index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
-
-	if (!data->kernel)
-		return index;
-
-	access = find_access(data->accesses, ref_id);
-	if (!access)
-		return index;
-	if (!isl_map_has_tuple_name(access->access, isl_dim_out))
-		return index;
-
-	name = get_outer_array_name(access->access);
-	i = find_array_index(data->kernel, name);
-	if (i < 0)
-		isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
-			"cannot find array",
-			return isl_multi_pw_aff_free(index));
-	data->local_array = &data->kernel->array[i];
-	data->array = data->local_array->array;
-
-	group = find_ref_group(data->local_array, access);
-	if (!group) {
-		data->global = 1;
-		return index;
-	}
-
-	tile = gpu_array_ref_group_tile(group);
-	data->global = !tile;
-	if (!tile)
-		return index;
-
-	space = isl_space_domain(isl_multi_aff_get_space(tile->tiling));
-	space = isl_space_range(isl_space_unwrap(space));
-	space = isl_space_map_from_set(space);
-	pma = isl_pw_multi_aff_identity(space);
-	sched2depth = isl_pw_multi_aff_copy(data->sched2copy);
-	dim = isl_pw_multi_aff_dim(sched2depth, isl_dim_out);
-	sched2depth = isl_pw_multi_aff_drop_dims(sched2depth, isl_dim_out,
-					tile->depth, dim - tile->depth);
-	pma = isl_pw_multi_aff_product(sched2depth, pma);
-	tiling = isl_multi_pw_aff_from_multi_aff(
-				isl_multi_aff_copy(tile->tiling));
-	tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
-
-	index = tile_outer(index, tiling);
-
-	return index;
-}
-
-/* Dereference "expr" by adding an index [0].
- * The original "expr" is assumed not to have any indices.
- *
- * If "expr" is a member access, then the dereferencing needs
- * to be applied to the structure argument of this member access.
- */ -static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr) -{ - isl_ctx *ctx; - isl_ast_expr *arg0, *res; - isl_ast_expr_list *list; - - arg0 = isl_ast_expr_get_op_arg(expr, 0); - if (!arg0) - return isl_ast_expr_free(expr); - if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op && - isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) { - isl_ast_expr *arg; - - arg = isl_ast_expr_get_op_arg(arg0, 0); - arg = dereference(arg); - arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg); - expr = isl_ast_expr_set_op_arg(expr, 0, arg0); - - return expr; - } - isl_ast_expr_free(arg0); - - ctx = isl_ast_expr_get_ctx(expr); - res = isl_ast_expr_from_val(isl_val_zero(ctx)); - list = isl_ast_expr_list_from_ast_expr(res); - res = isl_ast_expr_get_op_arg(expr, 0); - res = isl_ast_expr_access(res, list); - isl_ast_expr_free(expr); - - return res; -} - -/* Linearize the index expression "expr" based on the array bounds - * of "array". - * - * That is, transform expression - * - * A[i_0][i_1]...[i_n] - * - * to - * - * A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n] - * - * where b_0, b_1, ..., b_n are the bounds on the array. - * - * If the base of "expr" is a member access, then the linearization needs - * to be applied to the structure argument of this member access. - * - * In the base case, if "expr" has no arguments (other than the name of - * the array), then we are passing an entire array to a function. - * In this case, there is nothing to linearize. - * Note that at this point an expression with no arguments can - * only be an entire array because the scalar case and - * the case of single struct are handled by the caller. - * - * If the number of specified index expressions in "expr" - * is smaller than the dimension of the accessed array, - * then the missing i_j also do not appear in the linearized expression. - * Furthermore, since such an expression does not refer to a single - * element while the default linearized expression would refer to - * a single element, we return the expression - * - * A + (..((i_0 * b_1 + i_1) ... ) * b_l + i_l) - * - * instead. Note that because of the special case handling above, - * we can assume here that there is at least one index expression. 
- */ -__isl_give isl_ast_expr *gpu_local_array_info_linearize_index( - struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr) -{ - int i, n; - isl_ast_expr *arg0; - isl_ast_expr *res; - isl_ast_expr_list *list; - - arg0 = isl_ast_expr_get_op_arg(expr, 0); - if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op && - isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) { - isl_ast_expr *arg; - - arg = isl_ast_expr_get_op_arg(arg0, 0); - arg = gpu_local_array_info_linearize_index(array, arg); - arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg); - expr = isl_ast_expr_set_op_arg(expr, 0, arg0); - - return expr; - } - isl_ast_expr_free(arg0); - - if (isl_ast_expr_get_op_n_arg(expr) == 1) - return expr; - - n = isl_ast_expr_get_op_n_arg(expr); - res = isl_ast_expr_get_op_arg(expr, 1); - for (i = 1; i < array->n_index; ++i) { - isl_ast_expr *expr_i; - - expr_i = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i); - res = isl_ast_expr_mul(res, expr_i); - - if (i + 1 >= n) - continue; - expr_i = isl_ast_expr_get_op_arg(expr, i + 1); - res = isl_ast_expr_add(res, expr_i); - } - - if (1 + array->n_index > n) { - res = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), res); - } else { - list = isl_ast_expr_list_from_ast_expr(res); - res = isl_ast_expr_get_op_arg(expr, 0); - res = isl_ast_expr_access(res, list); - } - - isl_ast_expr_free(expr); - - return res; -} - -/* AST expression transformation callback for pet_stmt_build_ast_exprs. - * - * If the AST expression refers to an array that is not accessed - * at all, then this means the value of the expression is not used, - * so we might as well print zero (NULL pointer) instead. - * - * If the AST expression refers to a global scalar that is not - * a read-only scalar, then its address was passed to the kernel and - * we need to dereference it. - * - * If the AST expression refers to an access to a global array, - * then we linearize the access exploiting the bounds in data->local_array. - */ -static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr, - __isl_keep isl_id *id, void *user) -{ - struct ppcg_transform_data *data = user; - - if (!data->array) - return expr; - if (!data->array->accessed) { - isl_ctx *ctx; - - ctx = isl_ast_expr_get_ctx(expr); - isl_ast_expr_free(expr); - return isl_ast_expr_from_val(isl_val_zero(ctx)); - } - if (gpu_array_is_read_only_scalar(data->array)) - return expr; - if (!data->global) - return expr; - if (data->array->n_index == 0) - return dereference(expr); - if (!data->array->linearize) - return expr; - - return gpu_local_array_info_linearize_index(data->local_array, expr); -} - -/* This function is called for each instance of a user statement - * in the kernel "kernel", identified by "gpu_stmt". - * "kernel" may be NULL if we are not inside a kernel. - * - * We attach a struct ppcg_kernel_stmt to the "node", containing - * a computed AST expression for each access, through an annotation - * with name "user". - * These AST expressions are computed from iterator_map, - * which expresses the domain - * elements in terms of the generated loops, and sched2copy, - * which expresses the outer copy_schedule_dim dimensions of - * the kernel schedule computed by PPCG in terms of the generated loops. 
- */ -static __isl_give isl_ast_node *create_domain_leaf( - struct ppcg_kernel *kernel, __isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build, struct gpu_stmt *gpu_stmt, - struct gpu_gen *gen) -{ - struct ppcg_transform_data data; - struct ppcg_kernel_stmt *stmt; - isl_ctx *ctx; - isl_id *id; - isl_pw_multi_aff *sched2copy; - isl_map *map; - isl_pw_multi_aff *iterator_map; - isl_union_map *schedule; - - if (!node) - return NULL; - ctx = isl_ast_node_get_ctx(node); - - stmt = isl_calloc_type(ctx, struct ppcg_kernel_stmt); - if (!stmt) - return isl_ast_node_free(node); - - schedule = isl_ast_build_get_schedule(build); - map = isl_map_reverse(isl_map_from_union_map(schedule)); - iterator_map = isl_pw_multi_aff_from_map(map); - if (kernel) - sched2copy = compute_sched_to_copy(kernel, - isl_pw_multi_aff_copy(iterator_map)); - else - sched2copy = NULL; - - stmt->type = ppcg_kernel_domain; - stmt->u.d.stmt = gpu_stmt; - - data.kernel = kernel; - data.accesses = stmt->u.d.stmt->accesses; - data.iterator_map = iterator_map; - data.sched2copy = sched2copy; - stmt->u.d.ref2expr = gen->build_ast_expr(stmt->u.d.stmt->stmt, - build, &transform_index, &data, - &transform_expr, &data); - - isl_pw_multi_aff_free(iterator_map); - isl_pw_multi_aff_free(sched2copy); - - id = isl_id_alloc(ctx, "user", stmt); - id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); - return isl_ast_node_set_annotation(node, id); -} - -/* This function is called for each statement node in the AST - * for copying to or from shared/private memory. - * Attach a pointer to a ppcg_kernel_stmt representing the copy - * statement to the node. - * The statement name is "read" or "write", depending on whether we are - * reading from global memory or writing to global memory. - * - * The schedule is of the form - * - * type[D -> A] -> L - * - * where D corresponds to the outer tile->depth dimensions of - * the kernel schedule, A to the global array and L to the outer - * generated AST schedule. - * We compute the inverse and strip off the type, resulting in - * - * L -> [D -> A] - * - * We combine this mapping with on the one hand the projection - * - * [D -> A] -> A - * - * and on the other hand the group tiling - * - * [D -> A] -> T - * - * resulting in - * - * L -> A and L -> T - * - * and store the corresponding expressions in stmt->index and stmt->local_index, - * where stmt points to the ppcg_kernel_stmt that is attached to the node. - * stmt->index is linearized if the global memory array is linearized. 
- */ -static __isl_give isl_ast_node *create_access_leaf(struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group, __isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build) -{ - struct ppcg_kernel_stmt *stmt; - struct gpu_array_tile *tile; - isl_id *id; - isl_ast_expr *expr; - isl_space *space; - isl_map *access; - isl_pw_multi_aff *pma, *pma2; - const char *type; - - stmt = isl_calloc_type(kernel->ctx, struct ppcg_kernel_stmt); - if (!stmt) - return isl_ast_node_free(node); - - access = isl_map_from_union_map(isl_ast_build_get_schedule(build)); - type = isl_map_get_tuple_name(access, isl_dim_in); - stmt->u.c.read = !strcmp(type, "read"); - access = isl_map_reverse(access); - pma = isl_pw_multi_aff_from_map(access); - pma = isl_pw_multi_aff_reset_tuple_id(pma, isl_dim_out); - - space = isl_space_range(isl_pw_multi_aff_get_space(pma)); - space = isl_space_unwrap(space); - pma2 = isl_pw_multi_aff_range_map(space); - pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2, - isl_pw_multi_aff_copy(pma)); - expr = isl_ast_build_access_from_pw_multi_aff(build, pma2); - if (group->array->linearize) - expr = gpu_local_array_info_linearize_index(group->local_array, - expr); - stmt->u.c.index = expr; - - tile = gpu_array_ref_group_tile(group); - pma2 = isl_pw_multi_aff_from_multi_aff( - isl_multi_aff_copy(tile->tiling)); - pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2, pma); - expr = isl_ast_build_access_from_pw_multi_aff(build, pma2); - stmt->u.c.local_index = expr; - - stmt->u.c.array = group->array; - stmt->u.c.local_array = group->local_array; - stmt->type = ppcg_kernel_copy; - - id = isl_id_alloc(kernel->ctx, "copy", stmt); - id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); - return isl_ast_node_set_annotation(node, id); -} - -/* Create a synchronization ppcg_kernel_stmt and - * attach it to the node "node" representing the synchronization. - */ -static __isl_give isl_ast_node *create_sync_leaf( - struct ppcg_kernel *kernel, __isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build) -{ - struct ppcg_kernel_stmt *stmt; - isl_id *id; - - stmt = isl_calloc_type(kernel->ctx, struct ppcg_kernel_stmt); - if (!stmt) - return isl_ast_node_free(node); - - stmt->type = ppcg_kernel_sync; - id = isl_id_alloc(kernel->ctx, "sync", stmt); - id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); - return isl_ast_node_set_annotation(node, id); -} - -/* Build AST expressions for the device array sizes of all arrays in "prog" - * that require allocation on the device using "build", as well as - * for the original array sizes of all arrays that need to be declared - * on the host. - * "node" is freed in case of error. 
- */ -static __isl_give isl_ast_node *build_array_bounds( - __isl_take isl_ast_node *node, struct gpu_prog *prog, - __isl_keep isl_ast_build *build) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_multi_pw_aff *size; - isl_ast_expr *expr; - - if (!gpu_array_requires_device_allocation(array)) - continue; - - size = isl_multi_pw_aff_copy(array->bound); - expr = ppcg_build_size_expr(size, build); - array->bound_expr = expr; - if (!expr) - return isl_ast_node_free(node); - } - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_set *extent; - isl_multi_pw_aff *size; - isl_ast_expr *expr; - - if (!array->declare_local) - continue; - extent = isl_set_copy(array->declared_extent); - size = ppcg_size_from_extent(extent); - expr = ppcg_build_size_expr(size, build); - array->declared_size = expr; - if (!expr) - return isl_ast_node_free(node); - } - - return node; -} - -/* Internal data structure for at_domain. - * - * "prog" represents the entire scop. - * "kernel" points to the kernel to which the current schedule node - * belongs. It is set by before_mark and reset by after_mark. - * It may be NULL if we are outside any kernel. - */ -struct ppcg_at_domain_data { - struct gpu_prog *prog; - struct gpu_gen *gen; - struct ppcg_kernel *kernel; -}; - -/* This function is called for each instance of a user statement - * in the kernel. This may be one of the original user statements - * or a statement introduced by PPCG. - * - * We first check if the statement id corresponds to a gpu statement, - * which indicates the statement is an original user statement. Any statement - * that is not an original user statement has been introduced by PPCG and - * requires special handling. - * - * If the user statement is one of the original user statements, then we call - * create_domain_leaf. If it is "init_device", then we call - * build_array_bounds. Otherwise, we check if it is a copy or synchronization - * statement and call the appropriate functions. Statements that copy an array - * to/from the device do not need any further treatment. - * Neither does "clear_device". 
- */
-static __isl_give isl_ast_node *at_domain(__isl_take isl_ast_node *node,
-	__isl_keep isl_ast_build *build, void *user)
-{
-	struct ppcg_at_domain_data *data = user;
-	struct gpu_stmt *gpu_stmt;
-	isl_ast_expr *expr, *arg;
-	isl_id *id;
-	int is_sync;
-	const char *name;
-	void *p;
-
-	expr = isl_ast_node_user_get_expr(node);
-	arg = isl_ast_expr_get_op_arg(expr, 0);
-	id = isl_ast_expr_get_id(arg);
-	name = isl_id_get_name(id);
-	p = isl_id_get_user(id);
-	isl_ast_expr_free(expr);
-	isl_ast_expr_free(arg);
-
-	gpu_stmt = find_stmt(data->prog, id);
-	is_sync = gpu_tree_id_is_sync(id, data->kernel);
-	isl_id_free(id);
-
-	if (gpu_stmt)
-		return create_domain_leaf(data->kernel, node, build, gpu_stmt,
-					data->gen);
-
-	if (!prefixcmp(name, "to_device_") || !prefixcmp(name, "from_device_"))
-		return node;
-	if (!strcmp(name, "init_device"))
-		return build_array_bounds(node, data->prog, build);
-	if (!strcmp(name, "clear_device"))
-		return node;
-	if (is_sync < 0)
-		return isl_ast_node_free(node);
-	if (!strcmp(name, "read") || !strcmp(name, "write")) {
-		struct gpu_array_ref_group *group = p;
-		return create_access_leaf(data->kernel, group, node, build);
-	}
-	if (!is_sync)
-		isl_die(data->prog->ctx, isl_error_internal,
-			"unknown statement type",
-			return isl_ast_node_free(node));
-	return create_sync_leaf(data->kernel, node, build);
-}
-
-/* Given a set of wrapped references "ref", return the corresponding
- * access relations based on the tagged access relations "tagged".
- *
- * The elements of "ref" are of the form
- *
- *	[D -> R]
- *
- * with D an iteration domain and R a reference.
- * The elements of "tagged" are of the form
- *
- *	[D -> R] -> A
- *
- * with A an array.
- *
- * Extend "tagged" to include the iteration domain in the range, i.e.,
- *
- *	[D -> R] -> [D -> A]
- *
- * apply the result to "ref" and then unwrap the resulting set
- * to obtain relations of the form
- *
- *	D -> A
- */
-static __isl_give isl_union_map *wrapped_reference_to_access(
-	__isl_take isl_union_set *ref, __isl_take isl_union_map *tagged)
-{
-	isl_union_map *tag2access;
-
-	tag2access = isl_union_map_copy(tagged);
-	tag2access = isl_union_map_universe(tag2access);
-	tag2access = isl_union_set_unwrap(isl_union_map_domain(tag2access));
-	tag2access = isl_union_map_domain_map(tag2access);
-	tag2access = isl_union_map_range_product(tag2access, tagged);
-
-	ref = isl_union_set_coalesce(ref);
-	ref = isl_union_set_apply(ref, tag2access);
-
-	return isl_union_set_unwrap(ref);
-}
-
-/* Given an access relation "access" from one or more array reference groups,
- * remove those reads (if "read" is 1) or writes (if "read" is 0)
- * that are only needed to communicate data within
- * the same iteration of "sched".
- * The domain of "sched" corresponds to the original statement instances,
- * i.e., those that appear in the domains of the access relations.
- * "tagged" contains all tagged access relations to all
- * the array reference groups accessed by "access" from statement
- * instances scheduled by "sched".
- *
- * If the access is a read then it is either an element of
- *
- *	live_in union (range flow)
- *
- * where live_in and flow may be overapproximations, or
- * it reads an uninitialized value (that is not live-in because
- * there is an intermediate kill) or it reads a value that was
- * written within the same (compound) statement instance.
- * If the access is a write then it is either an element of - * - * live_out union (domain flow) - * - * or it writes a value that is never read (and is not live-out - * because of an intermediate kill) or only - * within the same (compound) statement instance. - * In both cases, the access relation is also a subset of - * the group access relation. - * - * The cases where an uninitialized value is read or a value is written - * that is never read or where the dataflow occurs within a statement - * instance are also considered local and may also be removed. - * - * Essentially, we compute the intersection of "access" with either - * - * live_in union (range non-local-flow) - * - * or - * - * live_out union (domain non-local-flow) - * - * We first construct a relation "local" - * - * [[D -> R] -> [D' -> R']] - * - * of pairs of domain iterations accessing the reference group - * and references in the group that are coscheduled by "sched". - * - * If this relation does not intersect the dataflow dependences, - * then there is nothing we can possibly remove, unless the dataflow - * dependences themselves only relate a subset of the accesses. - * In particular, the accesses may not be involved in any dataflow - * dependences, either because they are uninitialized reads/dead writes - * or because the dataflow occurs inside a statement instance. - * - * Since the computation below may break up the access relation - * into smaller pieces, we only perform the intersection with - * the non-local dependent accesses if the local pairs - * intersect the dataflow dependences. Otherwise, we intersect - * with the universe of the non-local dependent accesses. - * This should at least remove accesses from statements that - * do not participate in any dependences. - * - * In particular, we remove the "local" dataflow dependences from - * the set of all dataflow dependences, or at least those - * that may contribute to a domain/range that intersects - * the domain of "access". - * Note that if the potential dataflow dependences are an overapproximation - * of the actual dataflow dependences, then the result remains an - * overapproximation of the non-local dataflow dependences. - * Copying to/from global memory is only needed for the references - * in the domain/range of the result or for accesses that are live out/in - * for the entire scop. - * - * We therefore map the domain/range of the "external" relation - * to the corresponding access relation and take the union with - * the live out/in relation. 
- */
-static __isl_give isl_union_map *remove_local_accesses(
-	struct gpu_prog *prog, __isl_take isl_union_map *tagged,
-	__isl_take isl_union_map *access, __isl_take isl_union_map *sched,
-	int read)
-{
-	int empty;
-	isl_union_pw_multi_aff *tagger;
-	isl_union_set *domain, *access_domain;
-	isl_union_map *local, *external, *universe;
-	isl_union_set *tag_set;
-
-	if (isl_union_map_is_empty(access)) {
-		isl_union_map_free(sched);
-		isl_union_map_free(tagged);
-		return access;
-	}
-
-	tagger = isl_union_pw_multi_aff_copy(prog->scop->tagger);
-	domain = isl_union_map_domain(isl_union_map_copy(tagged));
-	tagger = isl_union_pw_multi_aff_intersect_domain(tagger,
-			isl_union_set_copy(domain));
-	sched = isl_union_map_preimage_domain_union_pw_multi_aff(sched, tagger);
-
-	local = isl_union_map_apply_range(sched,
-		isl_union_map_reverse(isl_union_map_copy(sched)));
-	local = isl_union_map_intersect(local,
-		isl_union_map_copy(prog->scop->tagged_dep_flow));
-
-	empty = isl_union_map_is_empty(local);
-
-	external = isl_union_map_copy(prog->scop->tagged_dep_flow);
-	universe = isl_union_map_universe(isl_union_map_copy(access));
-	access_domain = isl_union_map_domain(universe);
-	domain = isl_union_set_universe(domain);
-	universe = isl_union_set_unwrap(domain);
-	universe = isl_union_map_intersect_domain(universe, access_domain);
-	domain = isl_union_map_wrap(universe);
-	if (read)
-		external = isl_union_map_intersect_range(external, domain);
-	else
-		external = isl_union_map_intersect_domain(external, domain);
-	external = isl_union_map_intersect_params(external,
-		isl_set_copy(prog->scop->context));
-	external = isl_union_map_subtract(external, local);
-
-	if (read) {
-		tag_set = isl_union_map_range(external);
-		external = wrapped_reference_to_access(tag_set, tagged);
-		external = isl_union_map_union(external,
-			isl_union_map_copy(prog->scop->live_in));
-	} else {
-		tag_set = isl_union_map_domain(external);
-		external = wrapped_reference_to_access(tag_set, tagged);
-		external = isl_union_map_union(external,
-			isl_union_map_copy(prog->scop->live_out));
-	}
-
-	if (empty < 0)
-		external = isl_union_map_free(external);
-	else if (empty)
-		external = isl_union_map_universe(external);
-
-	access = isl_union_map_intersect(access, external);
-
-	return access;
-}
-
-/* Given an access relation "access" from "group", remove those reads
- * (if "read" is 1) or writes (if "read" is 0) that are only needed to
- * communicate data within the same iteration of the schedule "prefix"
- * at the position where the copying of the group is inserted.
- * That is, the output dimension of "prefix"
- * is equal to tile->depth.
- * The domain of "prefix" corresponds to the original statement instances,
- * i.e., those that appear in the domains of the access relations.
- *
- * Extract the tagged access relation of "group" and
- * then call remove_local_accesses.
- */
-static __isl_give isl_union_map *remove_local_accesses_group(
-	struct ppcg_kernel *kernel, struct gpu_array_ref_group *group,
-	__isl_take isl_union_map *access, __isl_keep isl_union_map *prefix,
-	int read)
-{
-	isl_union_map *sched, *tagged;
-
-	if (isl_union_map_is_empty(access))
-		return access;
-
-	tagged = group_tagged_access_relation(group);
-	sched = isl_union_map_copy(prefix);
-
-	return remove_local_accesses(kernel->prog, tagged, access, sched, read);
-}
-
-/* Build an access AST expression for the effective grid size using "build".
- * Store the result in kernel->grid_size_expr.
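The key step in remove_local_accesses() is intersecting coscheduled reference pairs with the dataflow dependences. A toy sketch (the schedule and the flow relation are illustrative; here all dataflow stays within one iteration, so the corresponding copies could be dropped):

#include <isl/ctx.h>
#include <isl/union_map.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	/* Tagged schedule: both references of S[i] run at time i. */
	isl_union_map *sched = isl_union_map_read_from_str(ctx,
		"{ [S[i] -> write[]] -> [i]; [S[i] -> read[]] -> [i] }");
	/* Dataflow from the write to the read of the same iteration. */
	isl_union_map *flow = isl_union_map_read_from_str(ctx,
		"{ [S[i] -> write[]] -> [S[i] -> read[]] }");
	isl_union_map *local;

	/* Pairs of references that are coscheduled... */
	local = isl_union_map_apply_range(isl_union_map_copy(sched),
		isl_union_map_reverse(isl_union_map_copy(sched)));
	/* ...restricted to actual dataflow: here this covers all of "flow",
	 * so none of these reads need a copy-in from global memory. */
	local = isl_union_map_intersect(local, isl_union_map_copy(flow));
	isl_union_map_dump(local);

	isl_union_map_free(local);
	isl_union_map_free(flow);
	isl_union_map_free(sched);
	isl_ctx_free(ctx);
	return 0;
}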
- */ -static isl_stat build_grid_size(struct ppcg_kernel *kernel, - __isl_keep isl_ast_build *build) -{ - isl_multi_pw_aff *size; - - size = isl_multi_pw_aff_copy(kernel->grid_size); - size = isl_multi_pw_aff_set_tuple_name(size, isl_dim_out, "grid"); - kernel->grid_size_expr = ppcg_build_size_expr(size, build); - - if (!kernel->grid_size_expr) - return isl_stat_error; - return isl_stat_ok; -} - -/* Build access AST expressions for the localized array sizes using "build". - * Store the result in local->bound_expr. - * Only do this for arrays for which localized bounds have been computed. - */ -static isl_stat build_local_array_sizes(struct ppcg_kernel *kernel, - __isl_keep isl_ast_build *build) -{ - int i; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *local = &kernel->array[i]; - isl_multi_pw_aff *size; - - if (local->n_group == 0) - continue; - size = isl_multi_pw_aff_copy(local->bound); - local->bound_expr = ppcg_build_size_expr(size, build); - if (!local->bound_expr) - return isl_stat_error; - } - - return isl_stat_ok; -} - -/* Build access AST expressions for the effective grid size and - * the localized array sizes using "build". - */ -static isl_stat build_grid_and_local_array_sizes(struct ppcg_kernel *kernel, - __isl_keep isl_ast_build *build) -{ - if (build_grid_size(kernel, build) < 0) - return isl_stat_error; - if (build_local_array_sizes(kernel, build) < 0) - return isl_stat_error; - return isl_stat_ok; -} - -/* This function is called before the AST generator starts traversing - * the schedule subtree of a node with mark "mark". - * - * If the mark is called "kernel", store the kernel pointer in data->kernel - * for use in at_domain and build AST expressions for the grid size and - * the localized array sizes. - */ -static isl_stat before_mark(__isl_keep isl_id *mark, - __isl_keep isl_ast_build *build, void *user) -{ - struct ppcg_at_domain_data *data = user; - - if (!mark) - return isl_stat_error; - if (!strcmp(isl_id_get_name(mark), "kernel")) { - data->kernel = isl_id_get_user(mark); - if (build_grid_and_local_array_sizes(data->kernel, build) < 0) - return isl_stat_error; - } - return isl_stat_ok; -} - -/* This function is called after the AST generator has finished traversing - * the schedule subtree of a mark node. "node" points to the corresponding - * mark AST node. - * - * If the mark is called "kernel", then replace "node" by a user node - * that "calls" the kernel, representing the launch of the kernel. - * The original "node" is stored inside the kernel object so that - * it can be used to print the device code. - * Note that this assumes that a kernel is only launched once. - * Also clear data->kernel. 
- */ -static __isl_give isl_ast_node *after_mark(__isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build, void *user) -{ - isl_ctx *ctx; - isl_id *id; - isl_ast_expr *expr; - isl_ast_expr_list *list; - struct ppcg_kernel *kernel; - struct ppcg_at_domain_data *data = user; - - ctx = isl_ast_node_get_ctx(node); - id = isl_ast_node_mark_get_id(node); - if (!id) - return isl_ast_node_free(node); - if (strcmp(isl_id_get_name(id), "kernel") || !data->kernel) { - isl_id_free(id); - return node; - } - kernel = data->kernel; - data->kernel = NULL; - kernel->space = isl_ast_build_get_schedule_space(build); - kernel->tree = isl_ast_node_mark_get_node(node); - isl_ast_node_free(node); - - expr = isl_ast_expr_from_id(isl_id_copy(id)); - list = isl_ast_expr_list_alloc(ctx, 0); - expr = isl_ast_expr_call(expr, list); - node = isl_ast_node_alloc_user(expr); - node = isl_ast_node_set_annotation(node, id); - - return node; -} - -static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user) -{ - int *depth = user; - int node_depth; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf) - return isl_bool_true; - node_depth = isl_schedule_node_get_schedule_depth(node); - if (node_depth > *depth) - *depth = node_depth; - - return isl_bool_false; -} - -/* Use isl to generate code for both the host and the device - * from "schedule". - * The device code is marked by "kernel" mark nodes in the schedule tree, - * containing a pointer to a ppcg_kernel object. - * The returned AST only contains the AST for the host code. - * The ASTs for the device code are embedded in ppcg_kernel objects - * attached to the leaf nodes that call "kernel". - */ -__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, - __isl_take isl_schedule *schedule) -{ - struct ppcg_at_domain_data data; - isl_ast_build *build; - isl_ast_node *tree; - isl_id_list *iterators; - int depth; - - data.prog = gen->prog; - data.gen = gen; - data.kernel = NULL; - - depth = 0; - if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth, - &depth) < 0) - return NULL; - build = isl_ast_build_alloc(gen->prog->ctx); - iterators = ppcg_scop_generate_names(gen->prog->scop, depth, "c"); - build = isl_ast_build_set_iterators(build, iterators); - build = isl_ast_build_set_at_each_domain(build, &at_domain, &data); - build = isl_ast_build_set_before_each_mark(build, &before_mark, &data); - build = isl_ast_build_set_after_each_mark(build, &after_mark, &data); - if (gen->prog->scop->options->debug->dump_final_schedule) - isl_schedule_dump(schedule); - tree = isl_ast_build_node_from_schedule(build, schedule); - isl_ast_build_free(build); - - return tree; -} - -__isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str) -{ - if (!str) - return NULL; - return isl_union_map_read_from_str(ctx, str); -} - -/* Can "node" be tiled and then mapped to block and thread identifiers? - * That is, is it permutable with at least one coincident dimension? 
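generate_code() above drives the same isl facility that the following standalone sketch applies to a trivial schedule map (the domain and schedule are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <isl/ctx.h>
#include <isl/union_map.h>
#include <isl/ast_build.h>
#include <isl/ast.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_union_map *schedule = isl_union_map_read_from_str(ctx,
		"{ S[i] -> [i] : 0 <= i < 100 }");
	isl_ast_build *build = isl_ast_build_alloc(ctx);
	isl_ast_node *tree;
	char *str;

	tree = isl_ast_build_node_from_schedule_map(build, schedule);
	str = isl_ast_node_to_C_str(tree);
	/* Prints roughly: for (int c0 = 0; c0 <= 99; c0 += 1)  S(c0); */
	printf("%s", str);

	free(str);
	isl_ast_node_free(tree);
	isl_ast_build_free(build);
	isl_ctx_free(ctx);
	return 0;
}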
- */
-static int is_permutable(__isl_keep isl_schedule_node *node)
-{
-	if (!node)
-		return -1;
-
-	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
-		return 0;
-	if (!isl_schedule_node_band_get_permutable(node))
-		return 0;
-	if (isl_schedule_node_band_n_member(node) < 1)
-		return 0;
-	if (!isl_schedule_node_band_member_get_coincident(node, 0))
-		return 0;
-
-	return 1;
-}
-
-/* An isl_schedule_foreach_schedule_node_top_down callback
- * for setting *any_permutable and aborting the search
- * if "node" is a permutable band with coincident dimensions.
- * Otherwise, continue searching.
- */
-static isl_bool set_permutable(__isl_keep isl_schedule_node *node, void *user)
-{
-	int *any_permutable = user;
-	int permutable;
-
-	permutable = is_permutable(node);
-	if (permutable < 0)
-		return isl_bool_error;
-	if (!permutable)
-		return isl_bool_true;
-
-	*any_permutable = 1;
-
-	return isl_bool_error;
-}
-
-/* Does the subtree rooted at "node" have any suitably permutable band nodes?
- * That is, does it have any nodes that are permutable and that
- * have at least one coincident dimension?
- */
-static int subtree_has_permutable_bands(__isl_keep isl_schedule_node *node)
-{
-	int any_parallelism = 0;
-
-	if (isl_schedule_node_foreach_descendant_top_down(node, &set_permutable,
-						&any_parallelism) < 0 &&
-	    !any_parallelism)
-		return -1;
-
-	return any_parallelism;
-}
-
-/* Does "schedule" contain any permutable band with at least one coincident
- * member?
- */
-int has_any_permutable_node(__isl_keep isl_schedule *schedule)
-{
-	isl_schedule_node *root;
-	int any_permutable;
-
-	root = isl_schedule_get_root(schedule);
-	any_permutable = subtree_has_permutable_bands(root);
-	isl_schedule_node_free(root);
-
-	return any_permutable;
-}
-
-/* Is "node" a candidate for mapping to block and thread identifiers?
- * In particular, is it permutable with at least one coincident dimension?
- * Alternatively, does the subtree rooted at "node" not contain
- * any such permutable node? Filter nodes are skipped in this case,
- * because a band node will be inserted in front of the returned
- * node and this is not possible for filter nodes that are children
- * of set or sequence nodes.
- */
-static int is_candidate(__isl_keep isl_schedule_node *node)
-{
-	int permutable;
-
-	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
-		return 1;
-	permutable = is_permutable(node);
-	if (permutable < 0 || permutable)
-		return permutable;
-	if (isl_schedule_node_get_type(node) == isl_schedule_node_filter)
-		return 0;
-	permutable = subtree_has_permutable_bands(node);
-	if (permutable < 0)
-		return -1;
-	return !permutable;
-}
-
-/* Is "node" the outermost node in its branch that can be tiled
- * and then mapped to block and thread identifiers?
- * If there are no such nodes in the subtree at "node" and
- * if "node" is not a filter node, then it is accepted too.
- */
-static int is_outer_tilable(__isl_keep isl_schedule_node *node)
-{
-	int tilable;
-	isl_schedule_node *ancestor;
-
-	tilable = is_candidate(node);
-	if (tilable < 0)
-		return -1;
-	if (!tilable)
-		return 0;
-
-	tilable = 0;
-	ancestor = isl_schedule_node_copy(node);
-	while (isl_schedule_node_has_parent(ancestor)) {
-		ancestor = isl_schedule_node_parent(ancestor);
-
-		tilable = is_candidate(ancestor);
-		if (tilable < 0 || tilable)
-			break;
-	}
-
-	isl_schedule_node_free(ancestor);
-	return tilable < 0 ? -1 : !tilable;
-}
-
-/* Collect the references to all writes in "group".
- * Each reference is represented by a universe set in a space - * - * [S[i,j] -> R[]] - * - * with S[i,j] the statement instance space and R[] the array reference. - */ -static __isl_give isl_union_set *group_tagged_writes( - struct gpu_array_ref_group *group) -{ - int i; - isl_space *space; - isl_union_set *writes; - - space = isl_map_get_space(group->access); - writes = isl_union_set_empty(space); - for (i = 0; i < group->n_ref; ++i) { - isl_space *space; - isl_set *writes_i; - - if (!group->refs[i]->write) - continue; - - space = isl_map_get_space(group->refs[i]->tagged_access); - space = isl_space_domain(space); - writes_i = isl_set_universe(space); - writes = isl_union_set_add_set(writes, writes_i); - } - - return writes; -} - -/* Is there any write access in "group" that requires synchronization - * on a write to global memory? - * We currently take into account all writes that would require - * synchronization at the thread level depth, but if the copying - * for this group is performed at an outer level, then we do not - * actually need to take into account dependences at intermediate levels. - */ -static int any_sync_writes_in_group(struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group) -{ - isl_union_set *writes; - int empty, disjoint; - - empty = isl_union_set_is_empty(kernel->sync_writes); - if (empty < 0) - return -1; - if (empty) - return 0; - - writes = group_tagged_writes(group); - disjoint = isl_union_set_is_disjoint(kernel->sync_writes, writes); - isl_union_set_free(writes); - - return disjoint < 0 ? -1 : !disjoint; -} - -/* Collect the references to all writes in "kernel" that write directly - * to global or shared memory, i.e., that are not mapped to private memory. - * Each reference is represented by a universe set in a space - * - * [S[i,j] -> R[]] - * - * with S[i,j] the statement instance space and R[] the array reference. - */ -static __isl_give isl_union_set *collect_non_private_tagged_writes( - struct ppcg_kernel *kernel) -{ - isl_union_set *writes; - int i, j; - - writes = isl_union_set_empty(isl_union_set_get_space(kernel->arrays)); - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - enum ppcg_group_access_type type; - isl_union_set *writes_ij; - - if (!group->write) - continue; - type = gpu_array_ref_group_type(group); - if (type == ppcg_access_private) - continue; - writes_ij = group_tagged_writes(group); - writes = isl_union_set_union(writes, writes_ij); - } - } - - return writes; -} - -/* Are there any direct writes to global memory that require - * synchronization? - */ -static int any_global_or_shared_sync_writes(struct ppcg_kernel *kernel) -{ - isl_union_set *writes; - int empty, disjoint; - - empty = isl_union_set_is_empty(kernel->sync_writes); - if (empty < 0) - return -1; - if (empty) - return 0; - - writes = collect_non_private_tagged_writes(kernel); - disjoint = isl_union_set_is_disjoint(kernel->sync_writes, writes); - isl_union_set_free(writes); - - return disjoint < 0 ? -1 : !disjoint; -} - -/* Construct an isl_multi_val for use as tile sizes for tiling "node" - * from the elements in "tile_size". 
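- *
- * For example (hypothetical sizes, not from the original source), for a
- * band with a two-dimensional schedule space and tile_size = { 32, 16 },
- * the result would be the isl_multi_val { [32, 16] } in that space.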
- */
-static __isl_give isl_multi_val *construct_band_tiles_sizes(
-	__isl_keep isl_schedule_node *node, int *tile_size)
-{
-	isl_space *space;
-
-	if (!node)
-		return NULL;
-
-	space = isl_schedule_node_band_get_space(node);
-	return ppcg_multi_val_from_int_list(space, tile_size);
-}
-
-/* Replace the partial schedule S of the band node "node" by
- *
- *	floor(S/f)
- *
- * or
- *
- *	f * floor(S/f)
- *
- * if scale_tile_loops is set, with f the integers in "factor".
- * The list that "factor" points to is assumed to contain at least
- * as many elements as the number of members in the band.
- */
-static __isl_give isl_schedule_node *snap_band_to_sizes(
-	__isl_take isl_schedule_node *node, int *factor,
-	struct ppcg_options *options)
-{
-	isl_multi_val *mv;
-
-	mv = construct_band_tiles_sizes(node, factor);
-	node = isl_schedule_node_band_scale_down(node, isl_multi_val_copy(mv));
-	if (options->scale_tile_loops)
-		node = isl_schedule_node_band_scale(node,
-						isl_multi_val_copy(mv));
-	isl_multi_val_free(mv);
-
-	return node;
-}
-
-/* Tile "band" with tile size specified by "sizes".
- *
- * Since the tile loops will be mapped to block ids, we forcibly
- * turn off tile loop scaling. We may want to enable tile loop scaling
- * at some later point, but then we would have to support the detection
- * of strides during the mapping to block ids.
- * Similarly, since the point loops will be mapped to thread ids,
- * we forcibly shift the point loops so that they start at zero.
- */
-static __isl_give isl_schedule_node *tile_band(
-	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes)
-{
-	isl_ctx *ctx = isl_schedule_node_get_ctx(node);
-	int scale_tile;
-	int shift_point;
-
-	scale_tile = isl_options_get_tile_scale_tile_loops(ctx);
-	isl_options_set_tile_scale_tile_loops(ctx, 0);
-	shift_point = isl_options_get_tile_shift_point_loops(ctx);
-	isl_options_set_tile_shift_point_loops(ctx, 1);
-
-	node = isl_schedule_node_band_tile(node, sizes);
-
-	isl_options_set_tile_scale_tile_loops(ctx, scale_tile);
-	isl_options_set_tile_shift_point_loops(ctx, shift_point);
-
-	return node;
-}
-
-/* Extract the set of parameter values and outer schedule dimensions
- * for which any statement instance
- * in the kernel inserted at "node" needs to be executed.
- * Intersect the set of parameter values derived from the host schedule
- * relation with the context of "prog".
- */
-static __isl_give isl_set *extract_context(__isl_keep isl_schedule_node *node,
-	struct gpu_prog *prog)
-{
-	isl_union_map *schedule;
-	isl_union_set *schedule_domain;
-	isl_set *context;
-	int empty;
-
-	schedule = isl_schedule_node_get_prefix_schedule_relation(node);
-	schedule_domain = isl_union_map_range(schedule);
-	empty = isl_union_set_is_empty(schedule_domain);
-	if (empty < 0) {
-		isl_union_set_free(schedule_domain);
-		return NULL;
-	}
-	if (empty) {
-		int depth;
-		isl_space *space;
-
-		space = isl_union_set_get_space(schedule_domain);
-		isl_union_set_free(schedule_domain);
-		space = isl_space_set_from_params(space);
-		depth = isl_schedule_node_get_schedule_depth(node);
-		space = isl_space_add_dims(space, isl_dim_set, depth);
-		context = isl_set_empty(space);
-	} else {
-		context = isl_set_from_union_set(schedule_domain);
-	}
-	context = isl_set_intersect_params(context,
-					isl_set_copy(prog->context));
-
-	return context;
-}
-
-/* Return the set of outer array elements accessed by
- * the statement instances in "domain" in "prog".
- * The instances in "domain" are those that appear - * in the domains of the access relations in "prog". - */ -static __isl_give isl_union_set *accessed_by_domain( - __isl_take isl_union_set *domain, struct gpu_prog *prog) -{ - isl_union_map *access; - isl_union_set *arrays; - - access = isl_union_map_union(isl_union_map_copy(prog->read), - isl_union_map_copy(prog->may_write)); - access = isl_union_map_intersect_domain(access, domain); - arrays = isl_union_map_range(access); - arrays = isl_union_set_apply(arrays, - isl_union_map_copy(prog->to_outer)); - - return arrays; -} - -/* Return the number of outer band members of the band node "node" - * that are marked coincident. - */ -static int n_outer_coincidence(__isl_keep isl_schedule_node *node) -{ - int i, n; - - n = isl_schedule_node_band_n_member(node); - - for (i = 0; i < n; ++i) - if (!isl_schedule_node_band_member_get_coincident(node, i)) - break; - - return i; -} - -/* If the band node "node" has more than "n" members, then split off - * the first "n" of them. - */ -static __isl_give isl_schedule_node *split_band( - __isl_take isl_schedule_node *node, int n) -{ - int dim; - - dim = isl_schedule_node_band_n_member(node); - if (n < dim) - node = isl_schedule_node_band_split(node, n); - - return node; -} - -/* Scale a band node that may have been split by split_band. - * "sizes" are the scaling factors for the original node. - * "node" either points to the original band node, or the outer - * of the two pieces after splitting. - * - * If the number of elements in "node" is smaller than the number of - * elements in "sizes", then some splitting has occurred and we split - * "sizes" in the same way. - */ -static __isl_give isl_schedule_node *scale_band( - __isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes) -{ - int n, dim; - - n = isl_multi_val_dim(sizes, isl_dim_set); - dim = isl_schedule_node_band_n_member(node); - if (n > dim) { - isl_multi_val *sizes2; - - sizes2 = isl_multi_val_copy(sizes); - sizes = isl_multi_val_drop_dims(sizes, - isl_dim_set, dim, n - dim); - sizes2 = isl_multi_val_drop_dims(sizes2, isl_dim_set, 0, dim); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_band_scale(node, sizes2); - node = isl_schedule_node_parent(node); - } - - return isl_schedule_node_band_scale(node, sizes); -} - -/* Return an isl_multi_aff, with as elements the parameters in "space" - * that have the names specified by the elements in "names". - * If (some of) these parameters do not already appear in "space", - * then they are added first. 
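- *
- * As a hypothetical illustration (names not from the original source),
- * for names = { "b0", "b1" } and a space with a single parameter n,
- * the result would be the isl_multi_aff
- *
- *	[n, b0, b1] -> [(b0), (b1)]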
- */ -static __isl_give isl_multi_aff *parameter_vector(__isl_take isl_space *space, - __isl_keep isl_id_list *names) -{ - int i, n; - isl_local_space *ls; - isl_multi_aff *ma; - - if (!names) - space = isl_space_free(space); - - n = isl_id_list_n_id(names); - for (i = 0; i < n; ++i) { - int pos; - isl_id *id; - - id = isl_id_list_get_id(names, i); - pos = isl_space_find_dim_by_id(space, isl_dim_param, id); - if (pos >= 0) { - isl_id_free(id); - continue; - } - pos = isl_space_dim(space, isl_dim_param); - space = isl_space_add_dims(space, isl_dim_param, 1); - space = isl_space_set_dim_id(space, isl_dim_param, pos, id); - } - ma = isl_multi_aff_zero(isl_space_copy(space)); - ls = isl_local_space_from_space(isl_space_domain(space)); - for (i = 0; i < n; ++i) { - int pos; - isl_id *id; - isl_aff *aff; - - id = isl_id_list_get_id(names, i); - pos = isl_space_find_dim_by_id(space, isl_dim_param, id); - isl_id_free(id); - aff = isl_aff_var_on_domain(isl_local_space_copy(ls), - isl_dim_param, pos); - ma = isl_multi_aff_set_aff(ma, i, aff); - } - isl_local_space_free(ls); - - return ma; -} - -/* Return constraints on the domain elements that equate a sequence of - * parameters called "names", to the partial schedule - * of "node" modulo the integers in "size". - * The number of elements in the array "size" should be equal - * to the number of elements in "names". - * The number of members of the band node "node" should be smaller - * than or equal to this number. If it is smaller, then the first - * elements of "names" are equated to zero. - */ -static __isl_give isl_union_set *set_schedule_modulo( - __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names, - int *size) -{ - int n, n_zero; - isl_space *space; - isl_multi_aff *ma; - isl_multi_union_pw_aff *mupa, *mupa2; - isl_multi_val *mv; - isl_union_set *domain; - - if (!node) - return NULL; - n = isl_id_list_n_id(names); - if (n == 0) - return isl_schedule_node_get_universe_domain(node); - n_zero = n - isl_schedule_node_band_n_member(node); - - mupa = isl_schedule_node_band_get_partial_schedule(node); - mv = construct_band_tiles_sizes(node, size + n_zero); - mupa = isl_multi_union_pw_aff_mod_multi_val(mupa, mv); - - space = isl_multi_union_pw_aff_get_space(mupa); - space = isl_space_params(space); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, n_zero); - ma = isl_multi_aff_zero(space); - - domain = isl_schedule_node_get_universe_domain(node); - mupa2 = isl_multi_union_pw_aff_multi_aff_on_domain( - isl_union_set_copy(domain), ma); - mupa = isl_multi_union_pw_aff_range_product(mupa2, mupa); - - space = isl_multi_union_pw_aff_get_space(mupa); - ma = parameter_vector(space, names); - - mupa2 = isl_multi_union_pw_aff_multi_aff_on_domain(domain, ma); - mupa = isl_multi_union_pw_aff_sub(mupa, mupa2); - - return isl_multi_union_pw_aff_zero_union_set(mupa); -} - -/* Insert a context node at "node" introducing the block and thread - * identifiers along with their bounds, which are stored in kernel->grid_size - * and kernel->block_dim. - * Note that the bounds on the block identifiers may implicitly impose - * constraints on the parameters. A guard needs to be inserted - * in the schedule tree to ensure that those bounds hold at "node". - * This guard is inserted in insert_guard. 
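- *
- * As an illustration (hypothetical sizes, not from the original source),
- * for a single block identifier b0 with dynamic grid size g and a fixed
- * block size of 32, the inserted context would be of the form
- *
- *	[g, b0, t0] -> { : 0 <= b0 < g and 0 <= t0 <= 31 }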
- */ -static __isl_give isl_schedule_node *insert_context(struct ppcg_kernel *kernel, - __isl_take isl_schedule_node *node) -{ - isl_set *context; - - context = isl_set_universe(isl_set_get_space(kernel->context)); - - context = add_bounded_parameters_dynamic(context, - kernel->grid_size, kernel->block_ids); - context = add_bounded_parameters(context, - kernel->block_dim, kernel->thread_ids); - - node = isl_schedule_node_insert_context(node, context); - - return node; -} - -/* Insert a guard that eliminates kernel launches where the kernel - * obviously does not have any work to do. - * - * In particular, eliminate kernel launches where there are obviously - * zero blocks. - * Use the same block size constraints that are used to create the context - * to ensure that all constraints implicit in the constructed context - * are imposed by the guard. - * - * Additionally, add other constraints that are valid - * for each executed instance ("context"), as long as this does not result - * in a disjunction. - */ -static __isl_give isl_schedule_node *insert_guard( - __isl_take isl_schedule_node *node, __isl_keep isl_set *context, - __isl_keep isl_multi_pw_aff *size, struct ppcg_scop *scop) -{ - unsigned nparam, n; - isl_set *guard; - isl_id_list *ids; - - guard = isl_set_copy(context); - guard = isl_set_compute_divs(guard); - guard = isl_set_from_basic_set(isl_set_simple_hull(guard)); - - nparam = isl_set_dim(guard, isl_dim_param); - n = isl_multi_pw_aff_dim(size, isl_dim_out); - ids = ppcg_scop_generate_names(scop, n, "__ppcg_tmp"); - guard = add_bounded_parameters_dynamic(guard, size, ids); - isl_id_list_free(ids); - guard = isl_set_project_out(guard, isl_dim_param, nparam, n); - - node = isl_schedule_node_insert_guard(node, guard); - - return node; -} - -/* Does any array reference group mapping require the band that is mapped - * to threads to be unrolled? - */ -static int kernel_requires_unroll(struct ppcg_kernel *kernel) -{ - int i, j; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - if (gpu_array_ref_group_requires_unroll(group)) - return 1; - } - } - - return 0; -} - -/* Mark the given band node "node" for unrolling by the AST generator and - * then sink it to the leaves of the schedule tree. - * All dimensions of "node" are assumed to be coincident, such that this - * sinking is a valid operation. - */ -static __isl_give isl_schedule_node *unroll(__isl_take isl_schedule_node *node) -{ - node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); - - node = isl_schedule_node_band_sink(node); - - return node; -} - -/* Insert a synchronization node in the schedule tree of "node" - * after the core computation of "kernel" at the level of the band - * that is mapped to threads, except if that level is equal to - * that of the band that is mapped to blocks or if there are no writes - * to global or shared memory in the core computation that require - * synchronization. - * If there are any writes to shared memory and the shared memory - * copying is performed at the same level, then synchronization - * is needed between the core and the copying anyway, so we might - * as well add it here. If the copying is performed at a higher - * level, then different iterations of intermediate schedule dimensions - * may have a different mapping from between shared memory elements and - * threads, such that synchronization is required after the core. 
- * "node" is assumed to point to the kernel node. - * - * If the shared and the thread mark point to the same node, then make - * sure the synchronization is inserted outside of the shared mark. - */ -static __isl_give isl_schedule_node *add_sync(struct ppcg_kernel *kernel, - __isl_take isl_schedule_node *node) -{ - int depth; - int need_sync; - - need_sync = any_global_or_shared_sync_writes(kernel); - if (need_sync < 0) - return isl_schedule_node_free(node); - if (!need_sync) - return node; - - node = gpu_tree_move_down_to_thread(node, kernel->core); - depth = isl_schedule_node_get_schedule_depth(node); - node = gpu_tree_move_up_to_kernel(node); - if (depth == isl_schedule_node_get_schedule_depth(node)) - return node; - - node = gpu_tree_move_down_to_depth(node, depth, kernel->core); - node = gpu_tree_ensure_following_sync(node, kernel); - - node = gpu_tree_move_up_to_kernel(node); - - return node; -} - -/* Return a read ("read" is 1) or write access relation for "group" - * with those accesses removed that are only needed to communicate data - * within the subtree of the schedule rooted at "node". - * Furthermore, include the prefix schedule at "node". - * That is, return a relation of the form - * - * S -> [D -> A] - * - * with D the outer schedule dimensions at "node". - */ -static __isl_give isl_union_map *anchored_non_local_accesses( - struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_schedule_node *node, int read) -{ - isl_union_map *access; - isl_union_map *prefix; - - prefix = isl_schedule_node_get_prefix_schedule_relation(node); - prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, - isl_union_pw_multi_aff_copy(kernel->contraction)); - access = gpu_array_ref_group_access_relation(group, read, !read); - access = remove_local_accesses_group(kernel, group, access, prefix, - read); - access = isl_union_map_range_product(prefix, access); - - return access; -} - -/* Given an array reference group "group", create a mapping - * - * read[D -> A] -> [D -> A] - * - * if "read" is set or - * - * write[D -> A] -> [D -> A] - * - * if "read" is not set. - * D corresponds to the outer tile->depth dimensions of - * the kernel schedule. - */ -static __isl_give isl_multi_aff *create_from_access(isl_ctx *ctx, - struct gpu_array_ref_group *group, int read) -{ - struct gpu_array_tile *tile; - isl_space *space; - isl_id *id; - - tile = gpu_array_ref_group_tile(group); - space = isl_space_copy(group->array->space); - space = isl_space_from_range(space); - space = isl_space_add_dims(space, isl_dim_in, tile->depth); - space = isl_space_wrap(space); - space = isl_space_map_from_set(space); - - id = isl_id_alloc(ctx, read ? "read" : "write", group); - space = isl_space_set_tuple_id(space, isl_dim_in, id); - - return isl_multi_aff_identity(space); -} - -/* If any writes in "group" require synchronization, then make sure - * that there is a synchronization node for "kernel" after the node - * following "node" in a sequence. - * - * If "shared" is set and no synchronization is needed for - * the writes to global memory, then add synchronization before - * the kernel to protect shared memory from being overwritten - * by the next iteration of the core computation. - * No additional synchronization is needed to protect against - * the next copy into shared memory because each element of - * the shared memory tile is always copied by the same thread. 
- */ -static __isl_give isl_schedule_node *add_group_write_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group, int shared) -{ - int need_sync; - - need_sync = any_sync_writes_in_group(kernel, group); - if (need_sync < 0) - return isl_schedule_node_free(node); - if (need_sync) { - node = isl_schedule_node_parent(node); - node = isl_schedule_node_next_sibling(node); - node = isl_schedule_node_child(node, 0); - node = gpu_tree_ensure_following_sync(node, kernel); - } else if (shared) { - struct gpu_array_tile *tile; - - tile = gpu_array_ref_group_tile(group); - node = isl_schedule_node_parent(node); - node = isl_schedule_node_parent(node); - node = gpu_tree_move_down_to_depth(node, tile->depth, - kernel->core); - node = gpu_tree_move_left_to_sync(node, kernel); - } - - return node; -} - -/* Add copy statements to the schedule tree of "node" - * for reading from global memory to private memory (if "read" is set) or - * for writing back from private memory to global memory - * (if "read" is not set) for the array reference group "group" that - * is mapped to private memory. - * On input, "node" points to the kernel node, and it is moved - * back there on output. - * - * The copies are performed in the order of the array elements. - * The copy statement instances include a reference to the outer - * tile->depth dimensions of the kernel schedule for ease of - * combining them with the group tiling. - * - * That is, the extra schedule is of the form - * - * type[D -> A] -> A - * - * where D corresponds to the outer tile->depth dimensions of - * the kernel schedule and A to the global array. - * This schedule is unrolled because registers are not addressable. - * - * The copying is inserted in the schedule tree through an extension - * of the form - * - * D -> type[D -> A] - * - * where the extra domain elements type[D -> A] are those accessed - * by the group. - * A filter is inserted on type[D -> A] to ensure that the element - * is read/written by the same thread that needs the element. - * This filter is obtained by applying - * - * S -> type[D -> A] - * - * to the thread filter for the core statements. - * - * The extension is inserted before the core computation in case of a read - * and after the core computation in case of a write. - * In the latter case, we also make sure that there is a synchronization - * node after the write to global memory, unless this write is performed - * at the outer level of the kernel. - * In principle, this synchronization could be inserted higher - * in the schedule tree depending on where the corresponding reads - * from global memory are performed. 
- */ -static __isl_give isl_schedule_node *add_copies_group_private( - struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_schedule_node *node, int read) -{ - struct gpu_array_tile *tile; - isl_union_map *access; - isl_union_set *domain; - isl_space *space; - isl_multi_aff *from_access; - isl_multi_pw_aff *mpa; - isl_multi_union_pw_aff *mupa; - isl_union_pw_multi_aff *contraction; - isl_schedule_node *graft; - isl_union_set *filter; - int kernel_depth; - int empty; - - kernel_depth = isl_schedule_node_get_schedule_depth(node); - tile = gpu_array_ref_group_tile(group); - node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core); - - access = anchored_non_local_accesses(kernel, group, node, read); - empty = isl_union_map_is_empty(access); - if (empty < 0 || empty) { - isl_union_map_free(access); - if (empty < 0) - return isl_schedule_node_free(node); - return gpu_tree_move_up_to_kernel(node); - } - - group->array->global = 1; - group->local_array->global = 1; - - from_access = create_from_access(kernel->ctx, group, read); - space = isl_space_domain(isl_multi_aff_get_space(from_access)); - access = isl_union_map_preimage_range_multi_aff(access, from_access); - - filter = isl_union_set_copy(kernel->thread_filter); - contraction = isl_union_pw_multi_aff_copy(kernel->contraction); - filter = isl_union_set_preimage_union_pw_multi_aff(filter, contraction); - filter = isl_union_set_apply(filter, isl_union_map_copy(access)); - filter = isl_union_set_detect_equalities(filter); - filter = isl_union_set_coalesce(filter); - - domain = isl_union_map_range(access); - access = isl_union_set_wrapped_domain_map(domain); - access = isl_union_map_reverse(access); - access = isl_union_map_coalesce(access); - graft = isl_schedule_node_from_extension(access); - - space = isl_space_map_from_set(space); - mpa = isl_multi_pw_aff_identity(space); - mpa = isl_multi_pw_aff_range_factor_range(mpa); - mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa); - - graft = isl_schedule_node_child(graft, 0); - graft = isl_schedule_node_insert_partial_schedule(graft, mupa); - graft = unroll(graft); - - graft = isl_schedule_node_insert_filter(graft, filter); - - graft = isl_schedule_node_parent(graft); - - if (read) - node = isl_schedule_node_graft_before(node, graft); - else { - node = isl_schedule_node_graft_after(node, graft); - if (kernel_depth < tile->depth) - node = add_group_write_sync(node, kernel, group, 0); - } - - node = gpu_tree_move_up_to_kernel(node); - - return node; -} - -/* Add copy statements to the schedule tree of "node" - * for reading from global memory to shared memory (if "read" is set) or - * for writing back from shared memory to global memory - * (if "read" is not set) for the array reference group "group" that - * is mapped to shared memory. - * On input, "node" points to the kernel node, and it is moved - * back there on output. - * - * The copies are performed in the order of the corresponding shared - * memory tile. - * The copy statement instances include a reference to the outer - * tile->depth dimensions of the kernel schedule for ease of - * combining them with the group tiling. - * - * If we are performing a read from global memory to shared memory and - * if the array involved is not a scalar, then we copy - * the entire tile to shared memory. This may result in some extra - * elements getting copied, but it should lead to simpler code - * (which means that fewer registers may be needed) and less divergence. 
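- * For example (hypothetical tile, not from the original source), a 34x34
- * shared memory tile of a stencil access that is read by a 32x32 thread
- * block would be copied in full, even if the core computation of a given
- * block only reads part of it.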
- *
- * Otherwise, we only copy the elements that will be read or have been written
- * in the kernel.
- *
- * That is, the extra schedule is of the form
- *
- *	type[D -> A] -> T
- *
- * where D corresponds to the outer tile->depth dimensions of
- * the kernel schedule, A to the global array and T is the corresponding
- * shared memory tile.
- *
- * The copying is inserted in the schedule tree through an extension
- * of the form
- *
- *	D -> type[D -> A]
- *
- * where the extra domain elements type[D -> A] are those accessed
- * by the group. In the case of a read from a non-scalar, this set
- * is replaced by the entire shared memory tile.
- *
- * If the "unroll_copy_shared" option is set, then the AST generator
- * is instructed to unroll the copying code.
- *
- * A filter is inserted on type[D -> A] to map the copy instances
- * to the threads. In particular, the thread identifiers are
- * equated to the position inside the shared memory tile (T)
- * modulo the block size.
- * We try to align the innermost tile dimension with the innermost
- * thread identifier (x) as a heuristic to improve coalescing.
- * In particular, if the dimension of the tile is greater than
- * the dimension of the block, then the schedule mapping to the tile
- * is broken up into two pieces and the filter is applied to the inner part.
- * If, on the other hand, the dimension of the tile is smaller than
- * the dimension of the block, then the initial thread identifiers
- * are equated to zero and the remaining thread identifiers are
- * matched to the memory tile.
- *
- * The extension is inserted before the core computation in case of a read
- * and after the core computation in case of a write.
- * In the case of a read, we first need to make sure there is some
- * synchronization before the core computation such that we can put the read
- * from global memory to shared memory before that synchronization.
- * This ensures that all threads have finished copying into shared memory
- * before the shared memory is used.
- * We also need to make sure that there is a synchronization node after
- * the core computation to ensure that the next load into shared memory
- * only happens after all data has been used. There is no need for
- * this synchronization if we are at the outer level since then there
- * won't be a next load.
- * In the case of a write, we need to make sure there is some synchronization
- * after the core computation such that we can put the write from shared
- * memory to global memory after that synchronization.
- * Unless we are at the outer level, we also need a synchronization node
- * after the write to ensure the data is saved to global memory
- * before the next iteration writes to the same shared memory.
- * It also makes sure the data has arrived in global memory before
- * it is read in a subsequent iteration.
- */ -static __isl_give isl_schedule_node *add_copies_group_shared( - struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_schedule_node *node, int read) -{ - struct gpu_array_tile *tile; - isl_union_map *access; - isl_union_set *domain; - isl_multi_aff *ma; - isl_multi_aff *from_access; - isl_multi_pw_aff *mpa; - isl_multi_union_pw_aff *mupa; - isl_schedule_node *graft; - isl_union_set *filter; - int skip; - int kernel_depth; - int empty; - - tile = gpu_array_ref_group_tile(group); - kernel_depth = isl_schedule_node_get_schedule_depth(node); - node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core); - - access = anchored_non_local_accesses(kernel, group, node, read); - empty = isl_union_map_is_empty(access); - if (empty < 0 || empty) { - isl_union_map_free(access); - if (empty < 0) - return isl_schedule_node_free(node); - return gpu_tree_move_up_to_kernel(node); - } - - group->array->global = 1; - group->local_array->global = 1; - - from_access = create_from_access(kernel->ctx, group, read); - - ma = isl_multi_aff_copy(tile->tiling); - ma = isl_multi_aff_pullback_multi_aff(ma, - isl_multi_aff_copy(from_access)); - mpa = isl_multi_pw_aff_from_multi_aff(ma); - mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa); - - domain = isl_union_map_range(access); - - if (read && !gpu_array_is_scalar(group->array)) { - isl_map *map; - isl_union_set_free(domain); - map = group_tile(group); - domain = isl_union_set_from_set(isl_map_wrap(map)); - } - - domain = isl_union_set_preimage_multi_aff(domain, from_access); - access = isl_union_set_wrapped_domain_map(domain); - access = isl_union_map_reverse(access); - access = isl_union_map_coalesce(access); - graft = isl_schedule_node_from_extension(access); - - graft = isl_schedule_node_child(graft, 0); - - graft = isl_schedule_node_insert_partial_schedule(graft, mupa); - if (kernel->options->unroll_copy_shared) - graft = ppcg_set_schedule_node_type(graft, isl_ast_loop_unroll); - - if (tile->n > kernel->n_block && kernel->n_block > 0) { - graft = isl_schedule_node_band_split(graft, - tile->n - kernel->n_block); - graft = isl_schedule_node_child(graft, 0); - } - if (tile->n < kernel->n_block) - skip = kernel->n_block - tile->n; - else - skip = 0; - filter = set_schedule_modulo(graft, kernel->thread_ids, - kernel->block_dim); - if (!kernel->options->wrap) - graft = snap_band_to_sizes(graft, kernel->block_dim + skip, - kernel->options); - if (tile->n > kernel->n_block && kernel->n_block > 0) - graft = isl_schedule_node_parent(graft); - graft = isl_schedule_node_insert_filter(graft, filter); - - while (graft && isl_schedule_node_has_parent(graft)) - graft = isl_schedule_node_parent(graft); - - if (read) { - if (kernel_depth < tile->depth) - node = gpu_tree_ensure_sync_after_core(node, kernel); - node = gpu_tree_move_left_to_sync(node, kernel); - node = isl_schedule_node_graft_before(node, graft); - } else { - node = gpu_tree_move_right_to_sync(node, kernel); - node = isl_schedule_node_graft_after(node, graft); - if (kernel_depth < tile->depth) - node = add_group_write_sync(node, kernel, group, 1); - } - - node = gpu_tree_move_up_to_kernel(node); - - return node; -} - -/* Check whether the array reference group "group" is mapped to - * private or shared memory and, if so, - * add copy statements to the schedule tree of "node" - * for reading from global memory to private or shared memory - * (if "read" is set) or for writing back from private or shared memory - * to global memory (if "read" is not set) for this group. 
- * On input, "node" points to the kernel node, and it is moved - * back there on output. - */ -static __isl_give isl_schedule_node *add_copies_group( - struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_schedule_node *node, int read) -{ - enum ppcg_group_access_type type; - - type = gpu_array_ref_group_type(group); - if (type == ppcg_access_private) - return add_copies_group_private(kernel, group, node, read); - if (type == ppcg_access_shared) - return add_copies_group_shared(kernel, group, node, read); - return node; -} - -/* For each array reference group that is mapped to private or shared memory, - * add copy statements to the schedule tree of "node" - * for reading from global memory to private or shared memory - * and for writing back. - * On input, "node" points to the kernel node, and it is moved - * back there on output. - */ -static __isl_give isl_schedule_node *add_copies(struct ppcg_kernel *kernel, - __isl_take isl_schedule_node *node) -{ - int i, j; - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *array = &kernel->array[i]; - - for (j = 0; j < array->n_group; ++j) { - struct gpu_array_ref_group *group = array->groups[j]; - - node = add_copies_group(kernel, group, node, 1); - if (!node) - return NULL; - node = add_copies_group(kernel, group, node, 0); - if (!node) - return NULL; - } - } - - return node; -} - -/* Mark all dimensions in the current band node atomic. - */ -static __isl_give isl_schedule_node *atomic(__isl_take isl_schedule_node *node) -{ - return ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); -} - -/* Mark "node" atomic, if it is a band node. - * Do the same for all ancestors. - * Return a pointer to "node" (in the updated schedule tree). - */ -static __isl_give isl_schedule_node *atomic_ancestors( - __isl_take isl_schedule_node *node) -{ - int pos; - - if (!node) - return NULL; - if (!isl_schedule_node_has_parent(node)) - return node; - - pos = isl_schedule_node_get_child_position(node); - node = isl_schedule_node_parent(node); - if (isl_schedule_node_get_type(node) == isl_schedule_node_band) - node = atomic(node); - node = atomic_ancestors(node); - node = isl_schedule_node_child(node, pos); - - return node; -} - -/* Collect all write references that require synchronization. - * "node" is assumed to point to the kernel node. - * Each reference is represented by a universe set in a space - * - * [S[i,j] -> R[]] - * - * with S[i,j] the statement instance space and R[] the array reference. - * - * This function should be called before block and thread filters are added. - * - * Synchronization is needed after a write if there is a subsequent read - * within the same block that may not be performed by the same thread. - * There should not be any dependences between different blocks, - * so we start with the flow dependences within the same kernel invocation - * and we subtract from these those dependences that are mapped - * to the same iteration of the bands where synchronization is inserted. - * We do not remove pairs of instances that are known to map to - * the same thread across different iterations of the intermediate - * bands because the read may be performed by a different thread - * than the one that needs the value if shared memory is involved. - * - * We also consider all pairs of possible writes that access the same - * memory location and that may be mapped to the same block but not - * to the same iteration of the intermediate bands. 
- * In theory, it would be possible for one thread to still be in
- * a previous iteration of a loop in these bands.
- * A write to global memory in this delayed thread could then overwrite
- * a write from another thread that has already moved on to
- * the next iteration.
- *
- * After computing the above writes paired off with reads or writes
- * that depend on them, we project onto the domain writes.
- * Synchronization is needed after writes to global memory
- * through these references.
- */
-static __isl_give isl_union_set *compute_sync_writes(
-	struct ppcg_kernel *kernel, __isl_keep isl_schedule_node *node)
-{
-	isl_union_map *local;
-	isl_union_map *may_writes, *shared_access;
-	isl_union_map *kernel_prefix, *thread_prefix;
-	isl_union_map *equal;
-	isl_union_set *wrap;
-	isl_union_set *domain;
-	isl_union_pw_multi_aff *contraction;
-
-	kernel_prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
-	node = isl_schedule_node_copy(node);
-	node = gpu_tree_move_down_to_thread(node, kernel->core);
-	thread_prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
-	isl_schedule_node_free(node);
-
-	contraction = kernel->contraction;
-	kernel_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
-		kernel_prefix, isl_union_pw_multi_aff_copy(contraction));
-	thread_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
-		thread_prefix, isl_union_pw_multi_aff_copy(contraction));
-	domain = isl_union_set_copy(kernel->expanded_domain);
-	domain = isl_union_set_universe(domain);
-
-	may_writes = isl_union_map_copy(kernel->prog->scop->tagged_may_writes);
-	may_writes = isl_union_map_curry(may_writes);
-	may_writes = isl_union_map_intersect_domain(may_writes, domain);
-	may_writes = isl_union_map_uncurry(may_writes);
-	shared_access = isl_union_map_copy(may_writes);
-	shared_access = isl_union_map_apply_range(shared_access,
-		isl_union_map_reverse(may_writes));
-
-	local = isl_union_map_copy(kernel->prog->scop->tagged_dep_flow);
-	local = isl_union_map_union(local, shared_access);
-	local = isl_union_map_zip(local);
-
-	equal = isl_union_map_apply_range(kernel_prefix,
-		isl_union_map_reverse(isl_union_map_copy(kernel_prefix)));
-	wrap = isl_union_map_wrap(equal);
-	local = isl_union_map_intersect_domain(local, wrap);
-	equal = isl_union_map_apply_range(thread_prefix,
-		isl_union_map_reverse(isl_union_map_copy(thread_prefix)));
-	wrap = isl_union_map_wrap(equal);
-	local = isl_union_map_subtract_domain(local, wrap);
-
-	local = isl_union_map_zip(local);
-	local = isl_union_map_universe(local);
-
-	return isl_union_map_domain(local);
-}
-
-/* Group the domain elements into a single space, named kernelX,
- * with X the kernel sequence number "kernel_id".
- */
-static __isl_give isl_schedule_node *group_statements(
-	__isl_take isl_schedule_node *node, int kernel_id)
-{
-	char buffer[20];
-	isl_id *id;
-
-	if (!node)
-		return NULL;
-
-	snprintf(buffer, sizeof(buffer), "kernel%d", kernel_id);
-	id = isl_id_alloc(isl_schedule_node_get_ctx(node), buffer, NULL);
-	return isl_schedule_node_group(node, id);
-}
-
-/* Create a ppcg_kernel representing the domain instances that reach "node"
- * and insert a mark node pointing to the ppcg_kernel before "node".
- * The band that "node" points to is the band that needs to be mapped
- * to block identifiers. The band that needs to be mapped to thread
- * identifiers should be marked by a "thread" mark by the caller.
- * The linear branch between the current node and the "thread" mark
- * may also have a "shared" mark. If present, the mapping to shared
- * memory is computed at that point.
- * Both marks are removed by this function.
- * If "scale" is set, then the band that "node" points to is scaled
- * by "sizes".
- *
- * Mark all outer band nodes as atomic to ensure each kernel is only
- * scheduled once.
- * If the domain elements that reach "node" live in more than one space,
- * then group the domain elements into a single space, named kernelX,
- * with X the kernel sequence number.
- *
- * Insert a guard node governing the kernel node to ensure that
- * no kernels with zero blocks are launched.
- *
- * Insert a context node describing the block and thread
- * identifiers inside the kernel mark.
- * The context node needs to be inserted after the effective block size
- * has been determined such that the bounds on the thread identifiers
- * would reflect the effective block size.
- * Insert a filter node inside the context node mapping the statement
- * instances to block identifiers. In particular, the block identifiers
- * are equated to the partial schedule of the band that was marked for
- * mapping to blocks modulo the grid size.
- * Insert a filter node inside the "thread" mark mapping the statement
- * instances to thread identifiers. In particular, the thread identifiers
- * are equated to the partial schedule of the band that was marked for
- * mapping to threads modulo the block size.
- *
- * Compute array reference groups for all arrays, set the local
- * array bounds based on the set of domain instances that reach
- * the kernel node, check the total amount of shared memory used
- * and compute all group tilings.
- * The array reference groups are computed after the block filter
- * has been inserted because it affects the mapping to shared or
- * private memory. This computation also requires the thread filter
- * (in the ppcg_kernel object), but this thread filter should not
- * have been added to the schedule tree yet since the computation
- * requires the schedule of the band that needs to be mapped to
- * threads before the privatization is applied.
- *
- * If any array reference group requires the band mapped to threads
- * to be unrolled, then we perform the required unrolling.
- *
- * We save a copy of the schedule that may influence the mappings
- * to shared or private memory in kernel->copy_schedule.
- *
- * Finally, we add synchronization and copy statements to the schedule tree,
- * remove the "thread" mark and create representations for the local
- * variables in the kernel.
- *
- * We keep a copy of the isl_id that points to the kernel to ensure
- * that the kernel does not get destroyed if the schedule node
- * is freed due to some error condition.
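- *
- * Schematically (hypothetical one-dimensional example, not from the
- * original source), a band { S[i] -> [i] } tiled with size 32 would end
- * up roughly as
- *
- *	kernel mark -> context -> block filter (b0 = floor(i/32) mod grid)
- *	  -> ... -> thread filter (t0 = i mod 32) -> leaf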
- */ -__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen, - __isl_take isl_schedule_node *node, int scale, - __isl_keep isl_multi_val *sizes) -{ - struct ppcg_kernel *kernel; - isl_id *id; - isl_schedule_node *node_thread; - isl_union_map *host_schedule; - isl_union_pw_multi_aff *contraction; - isl_set *host_domain; - isl_union_set *domain, *expanded; - int single_statement; - - node = gpu_tree_insert_shared_before_thread(node); - if (!node) - return NULL; - - kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel); - kernel = ppcg_kernel_create_local_arrays(kernel, gen->prog); - if (!kernel) - return isl_schedule_node_free(node); - - domain = isl_schedule_node_get_domain(node); - single_statement = isl_union_set_n_set(domain) == 1; - - kernel->ctx = gen->ctx; - kernel->prog = gen->prog; - kernel->options = gen->options; - kernel->context = extract_context(node, gen->prog); - kernel->core = isl_union_set_universe(isl_union_set_copy(domain)); - contraction = isl_schedule_node_get_subtree_contraction(node); - kernel->contraction = isl_union_pw_multi_aff_copy(contraction); - expanded = isl_union_set_copy(domain); - expanded = isl_union_set_preimage_union_pw_multi_aff(expanded, - contraction); - kernel->expanded_domain = isl_union_set_copy(expanded); - kernel->arrays = accessed_by_domain(expanded, gen->prog); - kernel->n_grid = n_outer_coincidence(node); - node_thread = isl_schedule_node_copy(node); - node_thread = gpu_tree_move_down_to_thread(node_thread, kernel->core); - node_thread = isl_schedule_node_child(node_thread, 0); - kernel->n_block = n_outer_coincidence(node_thread); - isl_schedule_node_free(node_thread); - kernel->id = gen->kernel_id++; - read_grid_and_block_sizes(kernel, gen); - - kernel->sync_writes = compute_sync_writes(kernel, node); - - host_schedule = isl_schedule_node_get_prefix_schedule_union_map(node); - host_domain = isl_set_from_union_set(isl_union_map_range( - host_schedule)); - - node = atomic_ancestors(node); - - id = isl_id_alloc(gen->ctx, "kernel", kernel); - id = isl_id_set_free_user(id, &ppcg_kernel_free_wrap); - node = isl_schedule_node_insert_mark(node, isl_id_copy(id)); - - if (!single_statement) - node = group_statements(node, kernel->id); - - node = isl_schedule_node_child(node, 0); - node = split_band(node, kernel->n_grid); - kernel->block_ids = ppcg_scop_generate_names(gen->prog->scop, - kernel->n_grid, "b"); - kernel->block_filter = set_schedule_modulo(node, kernel->block_ids, - kernel->grid_dim); - kernel->grid_size = extract_grid_size(kernel, - isl_union_set_copy(domain)); - if (!kernel->options->wrap) - node = snap_band_to_sizes(node, kernel->grid_dim, - kernel->options); - if (scale) - node = scale_band(node, isl_multi_val_copy(sizes)); - node = isl_schedule_node_parent(node); - if (!single_statement) - node = isl_schedule_node_parent(node); - node = insert_guard(node, kernel->context, kernel->grid_size, - gen->prog->scop); - node = gpu_tree_move_down_to_thread(node, kernel->core); - node = isl_schedule_node_child(node, 0); - node = split_band(node, kernel->n_block); - kernel->thread_ids = ppcg_scop_generate_names(gen->prog->scop, - kernel->n_block, "t"); - kernel->thread_filter = set_schedule_modulo(node, kernel->thread_ids, - kernel->block_dim); - if (extract_block_size(kernel, domain) < 0) - node = isl_schedule_node_free(node); - - node = gpu_tree_move_up_to_kernel(node); - node = isl_schedule_node_child(node, 0); - node = insert_context(kernel, node); - node = isl_schedule_node_child(node, 0); - node = 
isl_schedule_node_insert_filter(node, - isl_union_set_copy(kernel->block_filter)); - - node = gpu_tree_move_up_to_kernel(node); - - if (gpu_group_references(kernel, node) < 0) - node = isl_schedule_node_free(node); - localize_bounds(kernel, host_domain); - isl_set_free(host_domain); - - check_shared_memory_bound(kernel); - mark_global_arrays(kernel); - compute_group_tilings(kernel); - - node = gpu_tree_move_down_to_thread(node, kernel->core); - node = isl_schedule_node_child(node, 0); - if (!kernel->options->wrap) - node = snap_band_to_sizes(node, kernel->block_dim, - kernel->options); - node = isl_schedule_node_insert_filter(node, - isl_union_set_copy(kernel->thread_filter)); - if (kernel_requires_unroll(kernel)) { - node = isl_schedule_node_child(node, 0); - node = unroll(node); - } - - node = gpu_tree_move_up_to_thread(node); - kernel->copy_schedule_dim = isl_schedule_node_get_schedule_depth(node); - kernel->copy_schedule = - isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node); - contraction = isl_union_pw_multi_aff_copy(kernel->contraction); - kernel->copy_schedule = - isl_union_pw_multi_aff_pullback_union_pw_multi_aff( - kernel->copy_schedule, contraction); - - node = gpu_tree_move_up_to_kernel(node); - - node = add_sync(kernel, node); - node = add_copies(kernel, node); - - node = gpu_tree_move_down_to_shared(node, kernel->core); - node = isl_schedule_node_delete(node); - - node = gpu_tree_move_down_to_thread(node, kernel->core); - node = isl_schedule_node_delete(node); - - node = gpu_tree_move_up_to_kernel(node); - - if (create_kernel_vars(kernel) < 0) - node = isl_schedule_node_free(node); - - if (!single_statement) - node = isl_schedule_node_parent(node); - node = isl_schedule_node_parent(node); - - isl_id_free(id); - return node; -} - -/* Insert a zero-dimensional permutable band at "node". - */ -static __isl_give isl_schedule_node *insert_empty_permutable_band( - __isl_take isl_schedule_node *node) -{ - isl_space *space; - isl_schedule *schedule; - isl_union_set *domain; - isl_multi_union_pw_aff *mupa; - - schedule = isl_schedule_node_get_schedule(node); - domain = isl_schedule_get_domain(schedule); - space = isl_union_set_get_space(domain); - isl_union_set_free(domain); - isl_schedule_free(schedule); - - space = isl_space_set_from_params(space); - mupa = isl_multi_union_pw_aff_zero(space); - node = isl_schedule_node_insert_partial_schedule(node, mupa); - node = isl_schedule_node_band_set_permutable(node, 1); - - return node; -} - -/* See if hybrid tiling can be performed on "node" and its parent. - * If so, apply hybrid tiling and return the updated schedule tree. - * If not, return the original schedule tree. - * Return NULL on error. - * - * First check if "node", together with its parent, meets - * the basic requirements for hybrid tiling. - * If so, compute the relative dependence distances of "node" - * with respect to its parent and check if they are sufficiently bounded. - * If so, apply hybrid tiling using user specified tile sizes. - * - * The tile sizes are read before the dependence distance bounds are - * computed, because the user may have specified fewer dimensions - * than are available. In this case, the remaining schedule dimensions - * are split off and the dependence distances should be computed - * after these dimensions have been split off. 
- */ -static __isl_give isl_schedule_node *try_hybrid_tile(struct gpu_gen *gen, - __isl_take isl_schedule_node *node) -{ - int tile_len; - int *tile_size; - isl_bool ok; - isl_schedule_node *orig = node; - ppcg_ht_bounds *bounds; - - ok = ppcg_ht_parent_has_input_pattern(node); - if (ok < 0) - return isl_schedule_node_free(node); - if (!ok) - return orig; - - tile_len = 1 + isl_schedule_node_band_n_member(node); - tile_size = read_tile_sizes(gen, &tile_len); - if (!tile_size) - return isl_schedule_node_free(node); - - node = isl_schedule_node_copy(node); - node = split_band(node, tile_len - 1); - node = isl_schedule_node_parent(node); - bounds = ppcg_ht_compute_bounds(gen->prog->scop, node); - node = isl_schedule_node_child(node, 0); - - ok = ppcg_ht_bounds_is_valid(bounds); - if (ok >= 0 && ok) - node = gpu_hybrid_tile(gen, node, bounds, tile_size); - else - ppcg_ht_bounds_free(bounds); - free(tile_size); - - if (ok >= 0 && !ok) { - isl_schedule_node_free(node); - return orig; - } - isl_schedule_node_free(orig); - if (ok < 0) - return isl_schedule_node_free(node); - return node; -} - -/* If "node" is the outermost permutable band that can be mapped to block and - * thread identifiers in its branch (or the root of a subtree with - * no such outer bands), - * then mark the band as such, attaching a ppcg_kernel to the mark. - * - * If hybrid tiling is allowed, then first try and apply it - * to "node" and its parent. - * - * If "node" is the root of a subtree without permutable bands, - * then insert a zero-dimensional permutable band such that - * we can assume that "node" always points to a band node. - * This includes the case where "node" already points to a band node, - * but one without any coincident dimension. In this case, - * the extra node ensures that this original node does not get tiled. - * - * Tile "node" using user specified tile sizes, after splitting the band - * if the number of specified tile sizes is smaller than the dimension - * of the band. Mark the point band of this tiling as the band that - * needs to be mapped to threads and instruct the AST generator to unroll - * the band if the "unroll_gpu_tile" option is set. - * Create a kernel representing the domain instances that reach "node" and - * insert a mark node pointing to the ppcg_kernel before the band node. 
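- *
- * For example (hypothetical option value, not from the original source),
- * with --sizes="{ kernel[0] -> tile[32,32] }" a two-dimensional band is
- * split off if needed, tiled with size 32 in both dimensions, and the
- * resulting point band is marked "thread".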
- */
-static __isl_give isl_schedule_node *mark_outer_permutable(
-	__isl_take isl_schedule_node *node, void *user)
-{
-	struct gpu_gen *gen = user;
-	int outer;
-	int scale;
-	int tile_len;
-	int *tile_size;
-	isl_id *id;
-	isl_multi_val *sizes;
-
-	outer = is_outer_tilable(node);
-	if (outer < 0)
-		return isl_schedule_node_free(node);
-	if (!outer)
-		return node;
-
-	if (gen->options->hybrid) {
-		isl_schedule_node *saved = isl_schedule_node_copy(node);
-		node = try_hybrid_tile(gen, node);
-		isl_schedule_node_free(saved);
-		if (node != saved)
-			return node;
-	}
-
-	if (isl_schedule_node_get_type(node) != isl_schedule_node_band ||
-	    !isl_schedule_node_band_member_get_coincident(node, 0))
-		node = insert_empty_permutable_band(node);
-
-	tile_len = isl_schedule_node_band_n_member(node);
-	tile_size = read_tile_sizes(gen, &tile_len);
-	if (!tile_size)
-		return isl_schedule_node_free(node);
-	if (tile_len < isl_schedule_node_band_n_member(node))
-		node = isl_schedule_node_band_split(node, tile_len);
-	sizes = construct_band_tiles_sizes(node, tile_size);
-	node = tile_band(node, isl_multi_val_copy(sizes));
-	node = isl_schedule_node_child(node, 0);
-	if (gen->options->unroll_gpu_tile)
-		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
-	id = isl_id_alloc(gen->ctx, "thread", NULL);
-	node = isl_schedule_node_insert_mark(node, id);
-	node = isl_schedule_node_parent(node);
-
-	scale = gen->options->scale_tile_loops;
-	node = gpu_create_kernel(gen, node, scale, sizes);
-	isl_multi_val_free(sizes);
-	free(tile_size);
-
-	return node;
-}
-
-/* Given a set or sequence node, return the union of the filters of either all
- * (if "only_initial" is not set) or the initial (if "only_initial" is set)
- * direct subtrees that do not contain any suitably permutable bands
- * (according to subtree_has_permutable_bands).
- */
-static __isl_give isl_union_set *get_non_parallel_subtree_filters(
-	__isl_keep isl_schedule_node *node, int only_initial)
-{
-	isl_space *space;
-	isl_union_set *filter;
-	int i, n;
-
-	n = isl_schedule_node_n_children(node);
-	if (n < 0)
-		return NULL;
-
-	node = isl_schedule_node_copy(node);
-	node = isl_schedule_node_child(node, 0);
-	filter = isl_schedule_node_filter_get_filter(node);
-	node = isl_schedule_node_parent(node);
-	space = isl_union_set_get_space(filter);
-	isl_union_set_free(filter);
-	filter = isl_union_set_empty(space);
-
-	for (i = 0; i < n; ++i) {
-		int parallelism;
-
-		node = isl_schedule_node_child(node, i);
-		parallelism = subtree_has_permutable_bands(node);
-		if (parallelism < 0) {
-			filter = isl_union_set_free(filter);
-		} else if (!parallelism) {
-			isl_union_set *filter_i;
-			filter_i = isl_schedule_node_filter_get_filter(node);
-			filter = isl_union_set_union(filter, filter_i);
-		} else if (only_initial)
-			break;
-		node = isl_schedule_node_parent(node);
-	}
-
-	isl_schedule_node_free(node);
-
-	return filter;
-}
-
-/* Given a set or sequence node, return the union of the filters of
- * the direct subtrees that do not contain any suitably permutable bands
- * (according to subtree_has_permutable_bands).
- */
-static __isl_give isl_union_set *get_all_non_parallel_subtree_filters(
-	__isl_keep isl_schedule_node *node)
-{
-	return get_non_parallel_subtree_filters(node, 0);
-}
-
-/* Given a set or sequence node, return the union of the filters of
- * the initial direct subtrees that do not contain any suitably permutable
- * bands (according to subtree_has_permutable_bands).
- */ -static __isl_give isl_union_set *get_initial_non_parallel_subtree_filters( - __isl_keep isl_schedule_node *node) -{ - return get_non_parallel_subtree_filters(node, 1); -} - -/* Mark all variables that are accessed by the statement instances in "domain" - * and that are local to "prog" as requiring a declaration in the host code. - * The statement instances in "domain" correspond to (a subset of) - * the active instances at "node". - * "node" is not modified by this function, except that NULL is returned - * in case of error. - */ -static __isl_give isl_schedule_node *declare_accessed_local_variables( - __isl_take isl_schedule_node *node, struct gpu_prog *prog, - __isl_keep isl_union_set *domain) -{ - isl_union_pw_multi_aff *contraction; - isl_union_set *arrays; - int i; - - if (!ppcg_scop_any_hidden_declarations(prog->scop)) - return node; - contraction = isl_schedule_node_get_subtree_contraction(node); - domain = isl_union_set_copy(domain); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, contraction); - arrays = accessed_by_domain(domain, prog); - - for (i = 0; i < prog->n_array; ++i) { - isl_space *space; - isl_set *set; - int empty; - - if (!prog->array[i].local) - continue; - space = isl_set_get_space(prog->array[i].extent); - set = isl_union_set_extract_set(arrays, space); - empty = isl_set_plain_is_empty(set); - isl_set_free(set); - if (empty < 0) - goto error; - if (!empty) - prog->array[i].declare_local = 1; - } - - isl_union_set_free(arrays); - return node; -error: - isl_union_set_free(arrays); - return isl_schedule_node_free(node); -} - -/* If "node" points to a set node, then separate its children - * into subtrees that have suitably permutable bands and - * those that do not. - * Adjust the schedule tree in order to execute the second group - * after the first group and return a pointer to the first group, - * assuming there are any such subtrees. - * If "node" points to a sequence node, then separate the initial - * children that do not have suitably permutable bands and - * return a pointer to the subsequence of children that do have such bands, - * assuming there are any such subtrees. - * - * In both cases, mark all local variables in "prog" that are accessed by - * the group without permutable bands as requiring a declaration on the host. - */ -static __isl_give isl_schedule_node *isolate_permutable_subtrees( - __isl_take isl_schedule_node *node, struct gpu_prog *prog) -{ - isl_union_set *filter; - enum isl_schedule_node_type type; - - if (!node) - return NULL; - type = isl_schedule_node_get_type(node); - if (type == isl_schedule_node_set) { - filter = get_all_non_parallel_subtree_filters(node); - node = declare_accessed_local_variables(node, prog, filter); - node = isl_schedule_node_order_after(node, filter); - } else if (type == isl_schedule_node_sequence) { - filter = get_initial_non_parallel_subtree_filters(node); - node = declare_accessed_local_variables(node, prog, filter); - node = isl_schedule_node_order_before(node, filter); - } - - return node; -} - -/* Replace any reference to an array element in the range of "copy" - * by a reference to all array elements (defined by the extent of the array). 
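- *
- * For example (hypothetical array, not from the original source), a copy
- * relation { S[i] -> A[i] : 0 <= i < n } on an array with extent
- * { A[o] : 0 <= o < 2 * n } would be replaced by
- *
- *	{ S[i] -> A[o] : 0 <= i < n and 0 <= o < 2 * n }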
- */ -static __isl_give isl_union_map *approximate_copy_out( - __isl_take isl_union_map *copy, struct gpu_prog *prog) -{ - int i; - isl_union_map *res; - - res = isl_union_map_empty(isl_union_map_get_space(copy)); - - for (i = 0; i < prog->n_array; ++i) { - isl_space *space; - isl_set *set; - isl_union_map *copy_i; - isl_union_set *extent, *domain; - - space = isl_space_copy(prog->array[i].space); - extent = isl_union_set_from_set(isl_set_universe(space)); - copy_i = isl_union_map_copy(copy); - copy_i = isl_union_map_intersect_range(copy_i, extent); - set = isl_set_copy(prog->array[i].extent); - extent = isl_union_set_from_set(set); - domain = isl_union_map_domain(copy_i); - copy_i = isl_union_map_from_domain_and_range(domain, extent); - res = isl_union_map_union(res, copy_i); - } - - isl_union_map_free(copy); - - return res; -} - -/* Insert "kernel" marks that point to a ppcg_kernel structure - * in front of all outermost tilable bands that (by construction) - * have at least one parallel loop. - */ -static __isl_give isl_schedule_node *mark_kernels(struct gpu_gen *gen, - __isl_take isl_schedule_node *node) -{ - return isl_schedule_node_map_descendant_bottom_up(node, - &mark_outer_permutable, gen); -} - -/* Construct schedule constraints from the dependences in prog->scop and - * the array order dependences in prog->array_order. - * - * If live range reordering is allowed, then we need to make sure - * that live ranges on arrays are not run in parallel since doing - * so would require array expansion. We therefore add the array - * order dependences to the coincidence dependences. Non-zero array - * order dependences will then prevent a schedule dimension from being - * considered parallel. - * Live ranges derived from scalars are allowed to be run in parallel - * since we force the scalars to be mapped to private memory in - * check_scalar_live_ranges. - * If live range reordering is allowed, then the false dependences - * are not added to the validity constraints as that would prevent - * reordering. Instead, the external false dependences that enforce that reads - * from potentially live-in data precede any later write and - * that writes of potentially live-out data follow any other earlier write - * are added to the validity and the coincidence constraints. - * The false dependences are still added to the proximity constraints - * for consistency with the case where live range reordering is not allowed. - * The coincidence constraints then consist of flow dependences, - * external false dependences and array order dependences. - * The independences can be filtered out from the first two sets. - * They have already been filtered out from the array order dependences - * on a per array basis in collect_order_dependences. - * There is no need for a per array handling of the other two sets - * as there should be no flow or external false dependence on local - * variables that can be filtered out.
- */ -static __isl_give isl_schedule_constraints *construct_schedule_constraints( - struct gpu_prog *prog) -{ - isl_union_set *domain; - isl_union_map *dep_raw, *dep; - isl_union_map *validity, *proximity, *coincidence; - isl_schedule_constraints *sc; - - domain = isl_union_set_copy(prog->scop->domain); - sc = isl_schedule_constraints_on_domain(domain); - sc = isl_schedule_constraints_set_context(sc, - isl_set_copy(prog->scop->context)); - if (prog->scop->options->live_range_reordering) { - sc = isl_schedule_constraints_set_conditional_validity(sc, - isl_union_map_copy(prog->scop->tagged_dep_flow), - isl_union_map_copy(prog->scop->tagged_dep_order)); - proximity = isl_union_map_copy(prog->scop->dep_flow); - validity = isl_union_map_copy(proximity); - validity = isl_union_map_union(validity, - isl_union_map_copy(prog->scop->dep_forced)); - proximity = isl_union_map_union(proximity, - isl_union_map_copy(prog->scop->dep_false)); - coincidence = isl_union_map_copy(validity); - coincidence = isl_union_map_subtract(coincidence, - isl_union_map_copy(prog->scop->independence)); - coincidence = isl_union_map_union(coincidence, - isl_union_map_copy(prog->array_order)); - } else { - dep_raw = isl_union_map_copy(prog->scop->dep_flow); - dep = isl_union_map_copy(prog->scop->dep_false); - dep = isl_union_map_union(dep, dep_raw); - dep = isl_union_map_coalesce(dep); - proximity = isl_union_map_copy(dep); - coincidence = isl_union_map_copy(dep); - validity = dep; - } - sc = isl_schedule_constraints_set_validity(sc, validity); - sc = isl_schedule_constraints_set_coincidence(sc, coincidence); - sc = isl_schedule_constraints_set_proximity(sc, proximity); - - if (prog->scop->options->debug->dump_schedule_constraints) - isl_schedule_constraints_dump(sc); - return sc; -} - -/* Compute an appropriate schedule based on the accesses in - * gen->read and gen->write. - * - * We derive schedule constraints from the dependences in gen->prog->scop - * and then use isl to compute a schedule that has a parallel loop - * in each tilable band. - * During the schedule construction, some statement instances - * may be grouped first based on the input schedule. - */ -static __isl_give isl_schedule *compute_schedule(struct gpu_gen *gen) -{ - isl_schedule_constraints *sc; - isl_schedule *schedule; - - sc = construct_schedule_constraints(gen->prog); - schedule = gen->prog->scop->schedule; - schedule = ppcg_compute_schedule(sc, schedule, gen->options); - - return schedule; -} - -/* If the band node "node" has exactly one member then mark it permutable. - */ -static __isl_give isl_schedule_node *band_set_permutable( - __isl_take isl_schedule_node *node, - __isl_keep isl_schedule_constraints *sc) -{ - if (isl_schedule_node_band_n_member(node) == 1) - node = isl_schedule_node_band_set_permutable(node, 1); - - return node; -} - -/* Return the coincidence constraints between pairs of instances - * that are scheduled together by the ancestors of "node". - * That is, select those coincidence constraints that relate - * pairs of instances that have the same value for the prefix schedule. - * If the schedule depth is zero, then the prefix schedule does not - * contain any information, so we intersect domain and range - * of the schedule constraints with the reaching domain elements instead. 
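For reference, a minimal sketch (with a hypothetical function name) of the non-reordering branch of construct_schedule_constraints above: flow and false dependences are unioned, coalesced, and installed as validity, coincidence and proximity constraints. The sketch invokes isl's scheduler directly, whereas the real code goes through ppcg_compute_schedule:

#include <isl/schedule.h>
#include <isl/union_map.h>
#include <isl/union_set.h>

static __isl_give isl_schedule *compute_simple_schedule(
	__isl_take isl_union_set *domain,
	__isl_take isl_union_map *dep_flow,
	__isl_take isl_union_map *dep_false)
{
	isl_schedule_constraints *sc;
	isl_union_map *dep;

	/* All dependences must be respected (validity) and are also
	 * used to expose parallelism (coincidence) and improve
	 * locality (proximity), as in the code above. */
	dep = isl_union_map_union(dep_false, dep_flow);
	dep = isl_union_map_coalesce(dep);
	sc = isl_schedule_constraints_on_domain(domain);
	sc = isl_schedule_constraints_set_validity(sc,
		isl_union_map_copy(dep));
	sc = isl_schedule_constraints_set_coincidence(sc,
		isl_union_map_copy(dep));
	sc = isl_schedule_constraints_set_proximity(sc, dep);
	return isl_schedule_constraints_compute_schedule(sc);
}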
- */ -static __isl_give isl_union_map *get_local_coincidence( - __isl_keep isl_schedule_node *node, - __isl_keep isl_schedule_constraints *sc) -{ - isl_union_map *coincidence; - isl_multi_union_pw_aff *prefix; - isl_union_pw_multi_aff *contraction; - - coincidence = isl_schedule_constraints_get_coincidence(sc); - contraction = isl_schedule_node_get_subtree_contraction(node); - if (isl_schedule_node_get_schedule_depth(node) == 0) { - isl_union_set *domain; - - domain = isl_schedule_node_get_domain(node); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - contraction); - coincidence = isl_union_map_intersect_domain(coincidence, - isl_union_set_copy(domain)); - coincidence = isl_union_map_intersect_range(coincidence, - domain); - return coincidence; - } - - prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); - prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix, - contraction); - return isl_union_map_eq_at_multi_union_pw_aff(coincidence, prefix); -} - -/* For each member in the band node "node", determine whether - * it is coincident with respect to the outer nodes and mark - * it accordingly. - * - * That is, for each coincidence constraint between pairs - * of instances that are scheduled together by the outer nodes, - * check that domain and range are assigned the same value - * by the band member. This test is performed by checking - * that imposing the same value for the band member does not - * remove any elements from the set of coincidence constraints. - */ -static __isl_give isl_schedule_node *band_set_coincident( - __isl_take isl_schedule_node *node, - __isl_keep isl_schedule_constraints *sc) -{ - isl_union_map *coincidence; - isl_union_pw_multi_aff *contraction; - isl_multi_union_pw_aff *partial; - int i, n; - - coincidence = get_local_coincidence(node, sc); - - partial = isl_schedule_node_band_get_partial_schedule(node); - contraction = isl_schedule_node_get_subtree_contraction(node); - partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial, - contraction); - n = isl_schedule_node_band_n_member(node); - for (i = 0; i < n; ++i) { - isl_union_map *coincidence_i; - isl_union_pw_aff *upa; - isl_multi_union_pw_aff *partial_i; - int subset; - - upa = isl_multi_union_pw_aff_get_union_pw_aff(partial, i); - partial_i = isl_multi_union_pw_aff_from_union_pw_aff(upa); - coincidence_i = isl_union_map_copy(coincidence); - coincidence_i = isl_union_map_eq_at_multi_union_pw_aff( - coincidence_i, partial_i); - subset = isl_union_map_is_subset(coincidence, coincidence_i); - isl_union_map_free(coincidence_i); - - if (subset < 0) - break; - node = isl_schedule_node_band_member_set_coincident(node, i, - subset); - } - if (i < n) - node = isl_schedule_node_free(node); - isl_multi_union_pw_aff_free(partial); - isl_union_map_free(coincidence); - - return node; -} - -/* If "node" is a band, then set its properties. - * - * In particular, if the band has exactly one member, then mark it permutable. - * Mark the band member coincident based on the coincidence constraints - * of "sc". 
- */ -static __isl_give isl_schedule_node *set_band_properties( - __isl_take isl_schedule_node *node, void *user) -{ - isl_schedule_constraints *sc = user; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_band) - return node; - if (isl_schedule_node_band_n_member(node) == 0) - return node; - - node = band_set_permutable(node, sc); - node = band_set_coincident(node, sc); - - return node; -} - -/* Return the original schedule with all bands marked permutable and - * all band members marked coincident based on the coincidence constraints. - * The bands are explicitly marked permutable so that they will be considered - * by mark_outer_permutable. - */ -static __isl_give isl_schedule *determine_properties_original_schedule( - struct gpu_gen *gen) -{ - isl_schedule *schedule; - isl_schedule_constraints *sc; - - schedule = isl_schedule_copy(gen->prog->scop->schedule); - sc = construct_schedule_constraints(gen->prog); - schedule = isl_schedule_map_schedule_node_bottom_up(schedule, - &set_band_properties, sc); - isl_schedule_constraints_free(sc); - - return schedule; -} - -/* Compute a schedule or determine the properties of the original schedule - * depending on the value of the "reschedule" option. - */ -static __isl_give isl_schedule *compute_or_set_properties(void *user) -{ - struct gpu_gen *gen = user; - - if (gen->options->reschedule) - return compute_schedule(gen); - else - return determine_properties_original_schedule(gen); -} - -/* Obtain a schedule for the scop, by reading it from - * a file, by computing one or by determining the properties - * of the original schedule. - */ -__isl_give isl_schedule *get_schedule(struct gpu_gen *gen) -{ - return ppcg_get_schedule(gen->ctx, gen->options, - &compute_or_set_properties, gen); -} - -/* Construct the string "<a>_<b>". - */ -static char *concat(isl_ctx *ctx, const char *a, const char *b) -{ - isl_printer *p; - char *s; - - p = isl_printer_to_str(ctx); - p = isl_printer_print_str(p, a); - p = isl_printer_print_str(p, "_"); - p = isl_printer_print_str(p, b); - s = isl_printer_get_str(p); - isl_printer_free(p); - - return s; -} - -/* For each array in "prog" of which an element appears in "accessed" and - * that is not a read only scalar, create a zero-dimensional universe set - * of which the tuple id has name "<prefix>_<name of array>" and a user - * pointer pointing to the array (gpu_array_info). - * - * If the array is local to "prog", then make sure it will be declared - * in the host code. - * - * Return the list of these universe sets. - */ -static __isl_give isl_union_set_list *create_copy_filters(struct gpu_prog *prog, - const char *prefix, __isl_take isl_union_set *accessed) -{ - int i; - isl_ctx *ctx; - isl_union_set_list *filters; - - ctx = prog->ctx; - filters = isl_union_set_list_alloc(ctx, 0); - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_space *space; - isl_set *accessed_i; - int empty; - char *name; - isl_id *id; - isl_union_set *uset; - - if (gpu_array_is_read_only_scalar(array)) - continue; - - space = isl_space_copy(array->space); - accessed_i = isl_union_set_extract_set(accessed, space); - empty = isl_set_plain_is_empty(accessed_i); - isl_set_free(accessed_i); - if (empty < 0) { - filters = isl_union_set_list_free(filters); - break; - } - if (empty) - continue; - - array->global = 1; - if (array->local) - array->declare_local = 1; - - name = concat(ctx, prefix, array->name); - id = name ?
isl_id_alloc(ctx, name, array) : NULL; - free(name); - space = isl_space_set_alloc(ctx, 0, 0); - space = isl_space_set_tuple_id(space, isl_dim_set, id); - uset = isl_union_set_from_set(isl_set_universe(space)); - - filters = isl_union_set_list_add(filters, uset); - } - isl_union_set_free(accessed); - - return filters; -} - -/* Make sure that code for the statements in "filters" that - * copy arrays to or from the device is only generated when - * the size of the corresponding array is positive. - * That is, add a set node underneath "graft" with "filters" as children - * and for each child add a guard that selects the parameter - * values for which the corresponding array has a positive size. - * The array is available in the user pointer of the statement identifier. - * "depth" is the schedule depth of the position where "graft" - * will be added. - */ -static __isl_give isl_schedule_node *insert_positive_size_guards( - __isl_take isl_schedule_node *graft, - __isl_take isl_union_set_list *filters, int depth) -{ - int i, n; - - graft = isl_schedule_node_child(graft, 0); - graft = isl_schedule_node_insert_set(graft, filters); - n = isl_schedule_node_n_children(graft); - for (i = 0; i < n; ++i) { - isl_union_set *filter; - isl_set *domain, *guard; - isl_id *id; - struct gpu_array_info *array; - - graft = isl_schedule_node_child(graft, i); - filter = isl_schedule_node_filter_get_filter(graft); - domain = isl_set_from_union_set(filter); - id = isl_set_get_tuple_id(domain); - array = isl_id_get_user(id); - isl_id_free(id); - isl_set_free(domain); - guard = gpu_array_positive_size_guard(array); - guard = isl_set_from_params(guard); - guard = isl_set_add_dims(guard, isl_dim_set, depth); - graft = isl_schedule_node_child(graft, 0); - graft = isl_schedule_node_insert_guard(graft, guard); - graft = isl_schedule_node_parent(graft); - graft = isl_schedule_node_parent(graft); - } - graft = isl_schedule_node_parent(graft); - - return graft; -} - -/* Create a graft for copying arrays to or from the device, - * whenever the size of the array is strictly positive. - * Each statement is called "<prefix>_<name of array>" and - * the identifier has a user pointer pointing to the array. - * The graft will be added at the position specified by "node". - * "copy" contains the array elements that need to be copied. - * Only arrays of which some elements need to be copied - * will have a corresponding statement in the graft. - * Note though that each such statement will copy the entire array. - */ -static __isl_give isl_schedule_node *create_copy_device(struct gpu_prog *prog, - __isl_keep isl_schedule_node *node, const char *prefix, - __isl_take isl_union_set *copy) -{ - int depth; - isl_ctx *ctx; - isl_space *space; - isl_union_set *all, *domain; - isl_union_set_list *filters; - isl_union_map *extension; - isl_schedule_node *graft; - - ctx = prog->ctx; - depth = isl_schedule_node_get_schedule_depth(node); - filters = create_copy_filters(prog, prefix, copy); - all = isl_union_set_list_union(isl_union_set_list_copy(filters)); - - space = depth < 0 ?
NULL : isl_space_set_alloc(ctx, 0, depth); - domain = isl_union_set_from_set(isl_set_universe(space)); - extension = isl_union_map_from_domain_and_range(domain, all); - graft = isl_schedule_node_from_extension(extension); - - if (!filters) - return isl_schedule_node_free(graft); - if (isl_union_set_list_n_union_set(filters) == 0) { - isl_union_set_list_free(filters); - return graft; - } - - return insert_positive_size_guards(graft, filters, depth); -} - -/* Return (the universe spaces of) the arrays that are declared - * inside the scop corresponding to "prog" and for which all - * potential writes inside the scop form a subset of "domain". - */ -static __isl_give isl_union_set *extract_local_accesses(struct gpu_prog *prog, - __isl_keep isl_union_set *domain) -{ - int i; - isl_union_set *local; - - local = isl_union_set_empty(isl_union_set_get_space(domain)); - - for (i = 0; i < prog->n_array; ++i) { - isl_set *set; - isl_union_map *to_outer; - isl_union_map *may_write; - isl_union_set *write_domain; - isl_union_set *fields; - int subset; - - if (!prog->array[i].local) - continue; - - set = isl_set_universe(isl_space_copy(prog->array[i].space)); - to_outer = isl_union_map_copy(prog->to_outer); - to_outer = isl_union_map_intersect_range(to_outer, - isl_union_set_from_set(isl_set_copy(set))); - fields = isl_union_map_domain(to_outer); - may_write = isl_union_map_copy(prog->may_write); - may_write = isl_union_map_intersect_range(may_write, fields); - write_domain = isl_union_map_domain(may_write); - subset = isl_union_set_is_subset(write_domain, domain); - isl_union_set_free(write_domain); - - if (subset < 0) { - isl_set_free(set); - return isl_union_set_free(local); - } else if (subset) { - local = isl_union_set_add_set(local, set); - } else { - isl_set_free(set); - } - } - - return local; -} - -/* Internal data structure for node_may_persist. - * - * "tagger" maps tagged iteration domains to the corresponding untagged - * iteration domain. - * - * "may_persist_flow" is the set of all tagged dataflow dependences - * with those dependences removed that either precede or follow - * the kernel launch in a sequence. - * "inner_band_flow" is the set of all tagged dataflow dependences - * that are local to a given iteration of the outer band nodes - * with respect to the current node. - * "local_flow" is equal to "inner_band_flow", except that the domain - * and the range have been intersected with intermediate filters - * on children of sets or sequences. - */ -struct ppcg_may_persist_data { - isl_union_pw_multi_aff *tagger; - - isl_union_map *local_flow; - isl_union_map *inner_band_flow; - isl_union_map *may_persist_flow; -}; - -/* Update the information in "data" based on the band ancestor "node". - * - * In particular, we restrict the dependences in data->local_flow - * to those dependences where the source and the sink occur in - * the same iteration of the given band node. - * We also update data->inner_band_flow to the new value of - * data->local_flow.
- */ -static int update_may_persist_at_band(__isl_keep isl_schedule_node *node, - struct ppcg_may_persist_data *data) -{ - isl_multi_union_pw_aff *partial; - isl_union_pw_multi_aff *contraction; - isl_union_map *flow; - - if (isl_schedule_node_band_n_member(node) == 0) - return 0; - - partial = isl_schedule_node_band_get_partial_schedule(node); - contraction = isl_schedule_node_get_subtree_contraction(node); - partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial, - contraction); - partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial, - isl_union_pw_multi_aff_copy(data->tagger)); - - flow = data->local_flow; - flow = isl_union_map_eq_at_multi_union_pw_aff(flow, partial); - data->local_flow = flow; - - isl_union_map_free(data->inner_band_flow); - data->inner_band_flow = isl_union_map_copy(data->local_flow); - - return 0; -} - -/* Given a set of local reaching domain elements "domain", - * expand them to the corresponding leaf domain elements using "contraction" - * and insert the array references tags using data->tagger. - */ -static __isl_give isl_union_set *expand_and_tag( - __isl_take isl_union_set *domain, - __isl_take isl_union_pw_multi_aff *contraction, - struct ppcg_may_persist_data *data) -{ - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - contraction); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - isl_union_pw_multi_aff_copy(data->tagger)); - return domain; -} - -/* Given a filter node that is the child of a set or sequence node, - * restrict data->local_flow to refer only to those elements - * in the filter of the node. - * "contraction" maps the leaf domain elements of the schedule tree - * to the corresponding domain elements at (the parent of) "node". - */ -static int filter_flow(__isl_keep isl_schedule_node *node, - struct ppcg_may_persist_data *data, - __isl_take isl_union_pw_multi_aff *contraction) -{ - isl_union_set *filter; - isl_union_map *flow; - - flow = data->local_flow; - filter = isl_schedule_node_filter_get_filter(node); - filter = expand_and_tag(filter, contraction, data); - flow = isl_union_map_intersect_domain(flow, isl_union_set_copy(filter)); - flow = isl_union_map_intersect_range(flow, filter); - data->local_flow = flow; - - return 0; -} - -/* Given a filter node "node", collect the filters on all preceding siblings - * (which are also filter nodes), add them to "filters" and return the result. - */ -static __isl_give isl_union_set *add_previous_filters( - __isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node) -{ - isl_schedule_node *sibling; - - sibling = isl_schedule_node_copy(node); - while (sibling && isl_schedule_node_has_previous_sibling(sibling)) { - isl_union_set *filter; - - sibling = isl_schedule_node_previous_sibling(sibling); - filter = isl_schedule_node_filter_get_filter(sibling); - filters = isl_union_set_union(filters, filter); - } - isl_schedule_node_free(sibling); - if (!sibling) - return isl_union_set_free(filters); - - return filters; -} - -/* Given a filter node "node", collect the filters on all following siblings - * (which are also filter nodes), add them to "filters" and return the result. 
- */ -static __isl_give isl_union_set *add_next_filters( - __isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node) -{ - isl_schedule_node *sibling; - - sibling = isl_schedule_node_copy(node); - while (sibling && isl_schedule_node_has_next_sibling(sibling)) { - isl_union_set *filter; - - sibling = isl_schedule_node_next_sibling(sibling); - filter = isl_schedule_node_filter_get_filter(sibling); - filters = isl_union_set_union(filters, filter); - } - isl_schedule_node_free(sibling); - if (!sibling) - return isl_union_set_free(filters); - - return filters; -} - -/* Remove those flow dependences from data->may_persist_flow - * that flow between elements of "domain" within the same iteration - * of all outer band nodes. - * "contraction" maps the leaf domain elements of the schedule tree - * to the corresponding elements "domain". - */ -static void remove_external_flow(struct ppcg_may_persist_data *data, - __isl_take isl_union_set *domain, - __isl_keep isl_union_pw_multi_aff *contraction) -{ - isl_union_map *flow; - - contraction = isl_union_pw_multi_aff_copy(contraction); - domain = expand_and_tag(domain, contraction, data); - flow = isl_union_map_copy(data->local_flow); - flow = isl_union_map_intersect_domain(flow, isl_union_set_copy(domain)); - flow = isl_union_map_intersect_range(flow, domain); - - data->may_persist_flow = isl_union_map_subtract(data->may_persist_flow, - flow); -} - -/* Update the information in "data" based on the filter ancestor "node". - * We only need to modify anything if the filter is the child - * of a set or sequence node. - * - * In the case of a sequence, we remove the dependences between - * statement instances that are both executed either before or - * after the subtree that will be mapped to a kernel, within - * the same iteration of outer bands. - * - * In both cases, we restrict data->local_flow to the current child. - */ -static int update_may_persist_at_filter(__isl_keep isl_schedule_node *node, - struct ppcg_may_persist_data *data) -{ - enum isl_schedule_node_type type; - isl_schedule_node *parent; - isl_space *space; - isl_union_pw_multi_aff *contraction; - isl_union_set *before, *after, *filter; - - type = isl_schedule_node_get_parent_type(node); - if (type != isl_schedule_node_sequence && type != isl_schedule_node_set) - return 0; - - parent = isl_schedule_node_copy(node); - parent = isl_schedule_node_parent(parent); - contraction = isl_schedule_node_get_subtree_contraction(parent); - isl_schedule_node_free(parent); - - if (type == isl_schedule_node_set) - return filter_flow(node, data, contraction); - - filter = isl_schedule_node_filter_get_filter(node); - space = isl_union_set_get_space(filter); - isl_union_set_free(filter); - before = isl_union_set_empty(space); - after = isl_union_set_copy(before); - before = add_previous_filters(before, node); - after = add_next_filters(after, node); - - remove_external_flow(data, before, contraction); - remove_external_flow(data, after, contraction); - - return filter_flow(node, data, contraction); -} - -/* Update the information in "data" based on the ancestor "node". 
- */ -static isl_stat update_may_persist_at(__isl_keep isl_schedule_node *node, - void *user) -{ - struct ppcg_may_persist_data *data = user; - - switch (isl_schedule_node_get_type(node)) { - case isl_schedule_node_error: - return isl_stat_error; - case isl_schedule_node_context: - case isl_schedule_node_domain: - case isl_schedule_node_expansion: - case isl_schedule_node_extension: - case isl_schedule_node_guard: - case isl_schedule_node_leaf: - case isl_schedule_node_mark: - case isl_schedule_node_sequence: - case isl_schedule_node_set: - break; - case isl_schedule_node_band: - if (update_may_persist_at_band(node, data) < 0) - return isl_stat_error; - break; - case isl_schedule_node_filter: - if (update_may_persist_at_filter(node, data) < 0) - return isl_stat_error; - break; - } - - return isl_stat_ok; -} - -/* Determine the set of array elements that may need to be preserved - * by a kernel constructed from the subtree at "node". - * This includes the set of array elements that may need to be preserved - * by the entire scop (prog->may_persist) and the elements for which - * there is a potential flow dependence that may cross a kernel launch. - * - * To determine the second set, we start from all flow dependences. - * From this set of dependences, we remove those that cannot possibly - * require data to be preserved by a kernel launch. - * In particular, we consider the following sets of dependences. - * - dependences of which the write occurs inside the kernel. - * If the data is needed outside the kernel, then it will - * be copied out immediately after the kernel launch, so there - * is no need for any special care. - * - dependences of which the read occurs inside the kernel and the - * corresponding write occurs inside the same iteration of the - * outer band nodes. This means that the data is needed in - * the first kernel launch after the write, which is already - * taken care of by the standard copy-in. That is, the data - * do not need to be preserved by any intermediate call to - * the same kernel. - * - dependences of which the write and the read either both occur - * before the kernel launch or both occur after the kernel launch, - * within the same iteration of the outer band nodes with respect - * to the sequence that determines the ordering of the dependence - * and the kernel launch. Such flow dependences cannot cross - * any kernel launch. - * - * For the remaining (tagged) dependences, we take the domain - * (i.e., the tagged writes) and apply the tagged access relation - * to obtain the accessed data elements. - * These are then combined with the elements that may need to be - * preserved by the entire scop.
- */ -static __isl_give isl_union_set *node_may_persist( - __isl_keep isl_schedule_node *node, struct gpu_prog *prog) -{ - struct ppcg_may_persist_data data; - isl_union_pw_multi_aff *contraction; - isl_union_set *domain; - isl_union_set *persist; - isl_union_map *flow, *local_flow; - - data.tagger = prog->scop->tagger; - - flow = isl_union_map_copy(prog->scop->tagged_dep_flow); - data.local_flow = isl_union_map_copy(flow); - data.inner_band_flow = isl_union_map_copy(flow); - data.may_persist_flow = flow; - if (isl_schedule_node_foreach_ancestor_top_down(node, - &update_may_persist_at, &data) < 0) - data.may_persist_flow = - isl_union_map_free(data.may_persist_flow); - flow = data.may_persist_flow; - isl_union_map_free(data.local_flow); - - domain = isl_schedule_node_get_domain(node); - contraction = isl_schedule_node_get_subtree_contraction(node); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - contraction); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - isl_union_pw_multi_aff_copy(data.tagger)); - flow = isl_union_map_subtract_domain(flow, isl_union_set_copy(domain)); - local_flow = data.inner_band_flow; - local_flow = isl_union_map_intersect_range(local_flow, domain); - flow = isl_union_map_subtract(flow, local_flow); - - persist = isl_union_map_domain(flow); - persist = isl_union_set_apply(persist, - isl_union_map_copy(prog->scop->tagged_may_writes)); - persist = isl_union_set_union(persist, - isl_union_set_copy(prog->may_persist)); - - return persist; -} - -/* Add nodes for copying outer arrays in and out of the device - * before and after the subtree "node", which contains one or more kernels. - * "domain" contains the original statement instances, i.e., - * those that correspond to the domains of the access relations in "prog". - * In particular, the domain has not been contracted in any way. - * "prefix" contains the prefix schedule at that point, in terms - * of the same original statement instances. - * - * We first compute the sets of outer array elements that need - * to be copied in and out and then graft in the nodes for - * performing this copying. - * - * In particular, for each array that is possibly written anywhere in - * the subtree "node" and that may be used after "node" - * or that may be visible outside the corresponding scop, - * we copy out its entire extent. - * - * Any array element that is read without first being written inside - * the subtree "node" needs to be copied in. - * Furthermore, if there are any array elements that - * are copied out, but that may not be written inside "node", then - * they also need to be copied in to ensure that the value after execution - * is the same as the value before execution, at least for those array - * elements that may have their values preserved by the scop or that - * may be written before "node" and read after "node". - * In case the array elements are structures, we need to take into - * account that all members of the structures need to be written - * by "node" before we can avoid copying the data structure in. - * - * Note that the may_write relation is intersected with the domain, - * which has been intersected with the context. - * This helps in those cases where the arrays are declared with a fixed size, - * while the accesses are parametric and the context assigns a fixed value - * to the parameters. - * - * If an element from a local array is read without first being written, - * then there is no point in copying it in since it cannot have been - * written prior to the scop.
Warn about the uninitialized read instead. - */ -static __isl_give isl_schedule_node *add_to_from_device( - __isl_take isl_schedule_node *node, __isl_take isl_union_set *domain, - __isl_take isl_union_map *prefix, struct gpu_prog *prog) -{ - isl_union_set *local; - isl_union_set *may_persist; - isl_union_map *may_write, *must_write, *copy_out, *not_written; - isl_union_map *read, *copy_in; - isl_union_map *tagged; - isl_union_map *local_uninitialized; - isl_schedule_node *graft; - - tagged = isl_union_map_copy(prog->scop->tagged_reads); - tagged = isl_union_map_union(tagged, - isl_union_map_copy(prog->scop->tagged_may_writes)); - - may_write = isl_union_map_copy(prog->may_write); - may_write = isl_union_map_intersect_domain(may_write, - isl_union_set_copy(domain)); - may_write = remove_local_accesses(prog, - isl_union_map_copy(tagged), may_write, - isl_union_map_copy(prefix), 0); - may_write = isl_union_map_apply_range(may_write, - isl_union_map_copy(prog->to_outer)); - may_write = isl_union_map_apply_domain(may_write, - isl_union_map_copy(prefix)); - may_write = approximate_copy_out(may_write, prog); - copy_out = isl_union_map_copy(may_write); - may_write = isl_union_map_apply_range(may_write, - isl_union_map_copy(prog->to_inner)); - must_write = isl_union_map_copy(prog->must_write); - must_write = isl_union_map_apply_domain(must_write, - isl_union_map_copy(prefix)); - may_persist = node_may_persist(node, prog); - may_write = isl_union_map_intersect_range(may_write, may_persist); - not_written = isl_union_map_subtract(may_write, must_write); - - local = extract_local_accesses(prog, domain); - read = isl_union_map_copy(prog->read); - read = isl_union_map_intersect_domain(read, domain); - read = remove_local_accesses(prog, tagged, read, - isl_union_map_copy(prefix), 1); - local = isl_union_set_apply(local, isl_union_map_copy(prog->to_inner)); - local_uninitialized = isl_union_map_copy(prog->scop->live_in); - local_uninitialized = isl_union_map_intersect_range(local_uninitialized, - local); - local_uninitialized = isl_union_map_intersect(local_uninitialized, - isl_union_map_copy(read)); - if (!isl_union_map_is_empty(local_uninitialized)) { - fprintf(stderr, - "possibly uninitialized reads (not copied in):\n"); - isl_union_map_dump(local_uninitialized); - } - read = isl_union_map_subtract(read, local_uninitialized); - read = isl_union_map_apply_domain(read, prefix); - copy_in = isl_union_map_union(read, not_written); - copy_in = isl_union_map_apply_range(copy_in, - isl_union_map_copy(prog->to_outer)); - - graft = create_copy_device(prog, node, "to_device", - isl_union_map_range(copy_in)); - node = isl_schedule_node_graft_before(node, graft); - graft = create_copy_device(prog, node, "from_device", - isl_union_map_range(copy_out)); - node = isl_schedule_node_graft_after(node, graft); - - return node; -} - -/* Add nodes for initializing ("init_device") and clearing ("clear_device") - * the device before and after "node". 
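In outline, and eliding the prefix-schedule applications and the to_outer/to_inner conversions performed above, the copy sets computed by add_to_from_device relate as:

	copy_out    ~ approximate_copy_out(may_write)   (whole array extents)
	not_written ~ (copy_out intersect may_persist) minus must_write
	copy_in     ~ (read minus local_uninitialized) union not_written

That is, an array element is copied in if it may be read before being written inside the subtree, or if it may be copied out without being definitely written there.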
- */ -static __isl_give isl_schedule_node *add_init_clear_device( - __isl_take isl_schedule_node *node) -{ - isl_ctx *ctx; - isl_space *space; - isl_union_set *domain; - isl_schedule_node *graft; - - ctx = isl_schedule_node_get_ctx(node); - - space = isl_space_set_alloc(ctx, 0, 0); - space = isl_space_set_tuple_name(space, isl_dim_set, "init_device"); - domain = isl_union_set_from_set(isl_set_universe(space)); - graft = isl_schedule_node_from_domain(domain); - - node = isl_schedule_node_graft_before(node, graft); - - space = isl_space_set_alloc(ctx, 0, 0); - space = isl_space_set_tuple_name(space, isl_dim_set, "clear_device"); - domain = isl_union_set_from_set(isl_set_universe(space)); - graft = isl_schedule_node_from_domain(domain); - - node = isl_schedule_node_graft_after(node, graft); - - return node; -} - -/* Update "schedule" for mapping to a GPU device. - * - * In particular, insert a context node, create kernels for - * each outermost tilable band and introduce nodes for copying arrays - * in and out of the device and for initializing and clearing the device. - * If the child of the initial root points to a set node, - * then children of this node that do not contain any tilable bands - * are separated from the other children and are not mapped to - * the device. - * - * The GPU code is generated in a context where at least one - * statement instance is executed. The corresponding guard is inserted - * around the entire schedule. - */ -__isl_give isl_schedule *map_to_device(struct gpu_gen *gen, - __isl_take isl_schedule *schedule, int to_from_device) -{ - isl_schedule_node *node; - isl_set *context; - isl_set *guard; - isl_union_set *domain; - isl_union_map *prefix; - isl_union_pw_multi_aff *contraction; - struct gpu_prog *prog; - - context = isl_set_copy(gen->prog->context); - context = isl_set_from_params(context); - schedule = isl_schedule_insert_context(schedule, context); - - prog = gen->prog; - guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain)); - prog->context = isl_set_intersect(prog->context, isl_set_copy(guard)); - guard = isl_set_from_params(guard); - - node = isl_schedule_get_root(schedule); - isl_schedule_free(schedule); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_child(node, 0); - node = isolate_permutable_subtrees(node, gen->prog); - domain = isl_schedule_node_get_domain(node); - contraction = isl_schedule_node_get_subtree_contraction(node); - domain = isl_union_set_preimage_union_pw_multi_aff(domain, - isl_union_pw_multi_aff_copy(contraction)); - prefix = isl_schedule_node_get_prefix_schedule_union_map(node); - prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, - contraction); - node = mark_kernels(gen, node); - if (to_from_device) { - node = add_to_from_device(node, domain, prefix, gen->prog); - } else { - isl_union_set_free(domain); - isl_union_map_free(prefix); - } - node = isl_schedule_node_root(node); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_insert_guard(node, guard); - node = isl_schedule_node_child(node, 0); - node = add_init_clear_device(node); - schedule = isl_schedule_node_get_schedule(node); - isl_schedule_node_free(node); - - return schedule; -} - -/* Internal data structure for extract_access. - * "next_access" points to the end of a linked list that is extended - * by extract_access. - * "single_expression" is set if the access expressions belong to - * an expression statement (i.e., a statement without internal control). 
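The graft-a-nullary-statement pattern used by add_init_clear_device above can be factored into a reusable sketch (hypothetical helper name; the isl calls are exactly those used above):

#include <isl/schedule_node.h>
#include <isl/set.h>
#include <isl/space.h>
#include <isl/union_set.h>

/* Graft a zero-dimensional statement called "name"
 * (e.g. "init_device") immediately before the subtree at "node". */
static __isl_give isl_schedule_node *graft_nullary_stmt_before(
	__isl_take isl_schedule_node *node, const char *name)
{
	isl_ctx *ctx = isl_schedule_node_get_ctx(node);
	isl_space *space;
	isl_union_set *domain;
	isl_schedule_node *graft;

	space = isl_space_set_alloc(ctx, 0, 0);
	space = isl_space_set_tuple_name(space, isl_dim_set, name);
	domain = isl_union_set_from_set(isl_set_universe(space));
	graft = isl_schedule_node_from_domain(domain);
	return isl_schedule_node_graft_before(node, graft);
}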
- * "any_to_outer" maps all intermediate arrays to their outer arrays. - */ -struct ppcg_extract_access_data { - struct gpu_stmt_access **next_access; - int single_expression; - isl_union_map *any_to_outer; -}; - -/* Given a tagged access relation to a single array "tagged", extract it - * as a map, taking into account that the input may be empty. - * If the access relation is empty, then it does not contain - * any space information, so we try to recover it from the index - * expression. - * The space of the index expression is of the form I -> A, - * with I the statement instances and A the array, or [I -> F] -> A, - * with F the filters corresponding to arguments. - * We first drop F, if present, obtaining I -> A. - * Then we construct I -> R, with R the reference tag, - * combine the two into I -> [R -> A] and uncurry to obtain - * the final result [I -> R] -> A. - * Note that the index expression may have a lower dimension - * than that of the array, but this dimension is not used - * if the access relation is empty. - */ -static __isl_give isl_map *extract_single_tagged_access( - __isl_take isl_union_map *tagged, __isl_keep pet_expr *expr) -{ - int empty; - isl_id *id; - isl_space *space, *space2; - isl_multi_pw_aff *index; - - empty = isl_union_map_is_empty(tagged); - if (empty < 0) - goto error; - if (!empty) - return isl_map_from_union_map(tagged); - isl_union_map_free(tagged); - - index = pet_expr_access_get_index(expr); - space = isl_multi_pw_aff_get_space(index); - isl_multi_pw_aff_free(index); - if (isl_space_domain_is_wrapping(space)) - space = isl_space_domain_factor_domain(space); - space2 = isl_space_copy(space); - space2 = isl_space_from_domain(isl_space_domain(space)); - id = pet_expr_access_get_ref_id(expr); - space2 = isl_space_set_tuple_id(space2, isl_dim_out, id); - space = isl_space_range_product(space2, space); - space = isl_space_uncurry(space); - - return isl_map_empty(space); -error: - isl_union_map_free(tagged); - return NULL; -} - -/* Does the index expression "index" of "expr" represent an access - * to a single element? - * That is, is "index" completely specified? - * - * If "expr" accesses elements from different spaces (i.e., fields - * of a structure), then it does not access a single element. - * Otherwise, if the single space of the access matches the space - * of "index", then the index expression is completely specified - * (no pointer to a lower-dimensional slice of the accessed array) - * and a single element is being accessed. - */ -static isl_bool complete_index(__isl_keep pet_expr *expr, - __isl_keep isl_multi_pw_aff *index) -{ - isl_union_map *read, *write, *all; - isl_map *map; - isl_space *space1, *space2; - isl_bool complete; - - read = pet_expr_access_get_may_read(expr); - write = pet_expr_access_get_may_write(expr); - all = isl_union_map_union(read, write); - if (!all) - return isl_bool_error; - if (isl_union_map_n_map(all) != 1) { - isl_union_map_free(all); - return isl_bool_false; - } - map = isl_map_from_union_map(all); - space1 = isl_map_get_space(map); - isl_map_free(map); - space2 = isl_multi_pw_aff_get_space(index); - complete = isl_space_tuple_is_equal(space1, isl_dim_out, - space2, isl_dim_out); - isl_space_free(space1); - isl_space_free(space2); - - return complete; -} - -/* Does "expr" access a single, fixed element (independently of the statement - * instance)? - * That is, does it have a completely specified constant index expression? - * - * Note that it is not sufficient for the index expression to be - * piecewise constant. 
isl_multi_pw_aff_is_cst can therefore not be used. - */ -static isl_bool accesses_fixed_element(__isl_keep pet_expr *expr) -{ - int i, n; - isl_multi_pw_aff *index; - isl_bool fixed = isl_bool_true; - - index = pet_expr_access_get_index(expr); - if (!index) - return isl_bool_error; - n = isl_multi_pw_aff_dim(index, isl_dim_out); - for (i = 0; i < n; ++i) { - isl_pw_aff *pa; - - pa = isl_multi_pw_aff_get_pw_aff(index, i); - fixed = isl_pw_aff_n_piece(pa) == 1; - if (fixed) - fixed = isl_pw_aff_is_cst(pa); - isl_pw_aff_free(pa); - if (fixed < 0 || !fixed) - break; - } - if (fixed >= 0 && fixed) - fixed = complete_index(expr, index); - isl_multi_pw_aff_free(index); - - return fixed; -} - -/* Extract a gpu_stmt_access from "expr", append it to the list - * that ends in *data->next_access and update the end of the list. - * If the access expression performs a write, then it is considered - * exact only if it appears in a single expression statement and - * if its may access relation is equal to its must access relation. - * - * The combined set of may accesses may be a union if member accesses - * are involved, but the entire set is derived from a single reference and - * therefore from a single index expression. These accesses therefore - * all map to the same outer array. - */ -static int extract_access(__isl_keep pet_expr *expr, void *user) -{ - struct ppcg_extract_access_data *data = user; - isl_union_map *tagged; - struct gpu_stmt_access *access; - isl_ctx *ctx = pet_expr_get_ctx(expr); - isl_multi_pw_aff *index; - - access = isl_alloc_type(ctx, struct gpu_stmt_access); - assert(access); - access->next = NULL; - access->read = pet_expr_access_is_read(expr); - access->write = pet_expr_access_is_write(expr); - tagged = pet_expr_access_get_tagged_may_read(expr); - tagged = isl_union_map_union(tagged, - pet_expr_access_get_tagged_may_write(expr)); - tagged = isl_union_map_apply_range(tagged, - isl_union_map_copy(data->any_to_outer)); - if (!access->write) { - access->exact_write = 1; - } else if (!data->single_expression) { - access->exact_write = 0; - } else { - isl_union_map *must, *may; - may = isl_union_map_copy(tagged); - may = isl_union_map_domain_factor_domain(may); - must = pet_expr_access_get_must_write(expr); - access->exact_write = isl_union_map_is_equal(must, may); - isl_union_map_free(must); - isl_union_map_free(may); - } - index = pet_expr_access_get_index(expr); - access->n_index = isl_multi_pw_aff_dim(index, isl_dim_out); - isl_multi_pw_aff_free(index); - access->ref_id = pet_expr_access_get_ref_id(expr); - access->tagged_access = extract_single_tagged_access(tagged, expr); - access->access = isl_map_copy(access->tagged_access); - access->access = isl_map_domain_factor_domain(access->access); - access->fixed_element = accesses_fixed_element(expr); - - *data->next_access = access; - data->next_access = &(*data->next_access)->next; - - if (!access->access || access->fixed_element < 0) - return -1; - - return 0; -} - -/* Construct a linked list of gpu_stmt_access objects, - * one for each access expression in the statement body. - * "any_to_outer" maps all intermediate arrays to their outer arrays.
- */ -static int pet_stmt_extract_accesses(struct gpu_stmt *stmt, - __isl_keep isl_union_map *any_to_outer) -{ - struct ppcg_extract_access_data data; - - stmt->accesses = NULL; - data.next_access = &stmt->accesses; - data.single_expression = - pet_tree_get_type(stmt->stmt->body) == pet_tree_expr; - data.any_to_outer = any_to_outer; - return pet_tree_foreach_access_expr(stmt->stmt->body, - &extract_access, &data); -} - -/* Has statement "stmt" been killed from "scop"? - * That is, is the instance set of "scop" free from any - * instances of "stmt"? - */ -static isl_bool is_stmt_killed(struct ppcg_scop *scop, struct pet_stmt *stmt) -{ - isl_space *space; - isl_set *left; - isl_bool empty; - - if (!scop || !stmt) - return isl_bool_error; - space = isl_set_get_space(stmt->domain); - left = isl_union_set_extract_set(scop->domain, space); - empty = isl_set_plain_is_empty(left); - isl_set_free(left); - - return empty; -} - -/* Return an array of gpu_stmt representing the statements in "scop". - * Do not collect array accesses for statements that have been killed. - */ -static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop, - __isl_keep isl_union_map *any_to_outer) -{ - int i; - struct gpu_stmt *stmts; - - stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->pet->n_stmt); - if (!stmts) - return NULL; - - for (i = 0; i < scop->pet->n_stmt; ++i) { - struct gpu_stmt *s = &stmts[i]; - isl_bool killed; - - s->id = isl_set_get_tuple_id(scop->pet->stmts[i]->domain); - s->stmt = scop->pet->stmts[i]; - killed = is_stmt_killed(scop, scop->pet->stmts[i]); - if (killed < 0) - return free_stmts(stmts, i + 1); - if (killed) - continue; - if (pet_stmt_extract_accesses(s, any_to_outer) < 0) - return free_stmts(stmts, i + 1); - } - - return stmts; -} - -/* Generate CUDA code for "scop" and print it to "p". - * After generating an AST for the transformed scop as explained below, - * we call "gen->print" to print the AST in the desired output format - * to "p". - * - * If it turns out that it does not make sense to generate GPU code, - * then we generate CPU code instead. - * - * The declarations of the arrays that are visible outside of the scop - * are printed outside of the code generated from the schedule, - * because the generated code may involve a guard around the entire code. - * - * We first compute a schedule that respects the dependences - * of the original program and select the outermost bands - * of tilable dimensions that have at least one parallel loop. - * If the --load-schedule option is specified, then the loaded schedule - * is used instead of a computed schedule. - * - * Each of these bands B is then tiled according to "tile" sizes, resulting - * in two nested bands, with a kernel marker on top - * - * K - * | - * T - * | - * P - * - * We then split off at most 2 parallel dimensions from the T band and - * at most 3 parallel dimensions from the P band - * - * K - * | - * T - * T1 - * | - * T2 - * | - * P1 - * | - * P2 - * - * A filter is introduced in front of T1 that maps the domain instances - * to block identifiers. Similarly, a filter is introduced in front of P1 - * that maps the domain instances to thread identifiers. - * - * For each iteration of the T2 band and for each array, we compute - * the array elements accessed by that iteration, construct a rectangular - * box around it and shift it to the origin. The result is used - * as shared memory for the array. - * - * Copying and synchronization statements are added to this schedule tree.
- * In principle, these are added in front of the P1 band, but some of - * them may get hoisted up to higher levels. - * - * The entire AST is then generated from the single resulting schedule tree. - * During the generation the subtrees at kernel nodes (K) are saved - * aside and replaced by kernel calls. The result is printed as host code - * while the saved subtrees are printed as device code. - */ -static __isl_give isl_printer *generate(__isl_take isl_printer *p, - struct gpu_gen *gen, struct ppcg_scop *scop, - struct ppcg_options *options) -{ - struct gpu_prog *prog; - isl_ctx *ctx; - isl_schedule *schedule; - int any_permutable; - - if (!scop) - return isl_printer_free(p); - - ctx = isl_printer_get_ctx(p); - prog = gpu_prog_alloc(ctx, scop); - if (!prog) - return isl_printer_free(p); - - gen->prog = prog; - schedule = get_schedule(gen); - - any_permutable = has_any_permutable_node(schedule); - if (any_permutable < 0 || !any_permutable) { - if (any_permutable < 0) - p = isl_printer_free(p); - else - p = print_cpu(p, scop, options); - isl_schedule_free(schedule); - } else { - const int create_to_from_device = 1; - schedule = map_to_device(gen, schedule, create_to_from_device); - gen->tree = generate_code(gen, schedule); - p = ppcg_set_macro_names(p); - p = ppcg_print_exposed_declarations(p, prog->scop); - p = gen->print(p, gen->prog, gen->tree, &gen->types, - gen->print_user); - isl_ast_node_free(gen->tree); - } - - gpu_prog_free(prog); - - return p; -} - -/* Wrapper around generate for use as a ppcg_transform callback. - */ -static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p, - struct ppcg_scop *scop, void *user) -{ - struct gpu_gen *gen = user; - - return generate(p, gen, scop, gen->options); -} - -/* Transform the code in the file called "input" by replacing - * all scops by corresponding GPU code and write the results to "out". - */ -int generate_gpu(isl_ctx *ctx, const char *input, FILE *out, - struct ppcg_options *options, - __isl_give isl_printer *(*print)(__isl_take isl_printer *p, - struct gpu_prog *prog, __isl_keep isl_ast_node *tree, - struct gpu_types *types, void *user), void *user) -{ - struct gpu_gen gen; - int r; - int i; - - gen.ctx = ctx; - gen.sizes = extract_sizes_from_str(ctx, options->sizes); - gen.options = options; - gen.kernel_id = 0; - gen.print = print; - gen.print_user = user; - gen.types.n = 0; - gen.types.name = NULL; - - if (options->debug->dump_sizes) { - isl_space *space = isl_space_params_alloc(ctx, 0); - gen.used_sizes = isl_union_map_empty(space); - } - - r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen); - - if (options->debug->dump_sizes) { - isl_union_map_dump(gen.used_sizes); - isl_union_map_free(gen.used_sizes); - } - - isl_union_map_free(gen.sizes); - for (i = 0; i < gen.types.n; ++i) - free(gen.types.name[i]); - free(gen.types.name); - - return r; -} - -/* Compute the set of inner array elements that may have their values - * preserved by "prog". In particular, collect the array elements of - * arrays that are not local to "prog" and remove those elements that - * are definitely killed or definitely written by "prog". 
- */ -__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog) -{ - int i; - isl_union_set *may_persist, *killed; - isl_union_map *must_kill; - - may_persist = isl_union_set_empty(isl_set_get_space(prog->context)); - for (i = 0; i < prog->n_array; ++i) { - isl_set *extent; - - if (prog->array[i].local) - continue; - - extent = isl_set_copy(prog->array[i].extent); - may_persist = isl_union_set_add_set(may_persist, extent); - } - - may_persist = isl_union_set_intersect_params(may_persist, - isl_set_copy(prog->context)); - may_persist = isl_union_set_apply(may_persist, - isl_union_map_copy(prog->to_inner)); - must_kill = isl_union_map_copy(prog->tagged_must_kill); - killed = isl_union_map_range(must_kill); - must_kill = isl_union_map_copy(prog->must_write); - killed = isl_union_set_union(killed, isl_union_map_range(must_kill)); - - may_persist = isl_union_set_subtract(may_persist, killed); - return may_persist; -} - -struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop) -{ - struct gpu_prog *prog; - isl_space *space; - isl_map *id; - - if (!scop) - return NULL; - - prog = isl_calloc_type(ctx, struct gpu_prog); - assert(prog); - - prog->ctx = ctx; - prog->scop = scop; - prog->context = isl_set_copy(scop->context); - prog->n_stmts = scop->pet->n_stmt; - prog->any_to_outer = pet_scop_compute_outer_to_any(scop->pet); - prog->any_to_outer = isl_union_map_reverse(prog->any_to_outer); - space = isl_union_map_get_space(prog->any_to_outer); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, 1); - space = isl_space_map_from_set(space); - id = isl_map_identity(space); - prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer, id); - prog->stmts = extract_stmts(ctx, scop, prog->any_to_outer); - prog->read = isl_union_map_copy(scop->reads); - prog->may_write = isl_union_map_copy(scop->may_writes); - prog->must_write = isl_union_map_copy(scop->must_writes); - prog->tagged_must_kill = isl_union_map_copy(scop->tagged_must_kills); - prog->to_inner = pet_scop_compute_outer_to_inner(scop->pet); - prog->to_outer = isl_union_map_copy(prog->to_inner); - prog->to_outer = isl_union_map_reverse(prog->to_outer); - - if (!prog->stmts) - return gpu_prog_free(prog); - - if (collect_array_info(prog) < 0) - return gpu_prog_free(prog); - prog->may_persist = compute_may_persist(prog); - - return prog; -} - -void *gpu_prog_free(struct gpu_prog *prog) -{ - if (!prog) - return NULL; - free_array_info(prog); - free_stmts(prog->stmts, prog->n_stmts); - isl_union_map_free(prog->any_to_outer); - isl_union_map_free(prog->to_outer); - isl_union_map_free(prog->to_inner); - isl_union_map_free(prog->read); - isl_union_map_free(prog->may_write); - isl_union_map_free(prog->must_write); - isl_union_map_free(prog->tagged_must_kill); - isl_union_map_free(prog->array_order); - isl_union_set_free(prog->may_persist); - isl_set_free(prog->context); - free(prog); - return NULL; -} diff --git a/polly/lib/External/ppcg/gpu_array_tile.h b/polly/lib/External/ppcg/gpu_array_tile.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_array_tile.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef GPU_ARRAY_TILE_H -#define GPU_ARRAY_TILE_H - -#include -#include -#include - -/* The fields stride and shift only contain valid information - * if shift != NULL. - * If so, they express that current index is such that if you add shift, - * then the result is always a multiple of stride. - * Let D represent the initial tile->depth dimensions of the computed schedule. 
- * The spaces of "lb" and "shift" are of the form - * - * D -> [b] - */ -struct gpu_array_bound { - isl_val *size; - isl_aff *lb; - - isl_val *stride; - isl_aff *shift; -}; - -/* A tile of an outer array. - * - * requires_unroll is set if the schedule dimensions that are mapped - * to threads need to be unrolled for this (private) tile to be used. - * - * "depth" reflects the number of schedule dimensions that affect the tile. - * The copying into and/or out of the tile is performed at that depth. - * - * n is the dimension of the array. - * bound is an array of size "n" representing the lower bound - * and size for each index. - * - * tiling maps a tile in the global array to the corresponding - * shared/private memory tile and is of the form - * - * { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] } - * - * where D represents the initial "depth" dimensions - * of the computed schedule. - */ -struct gpu_array_tile { - isl_ctx *ctx; - int requires_unroll; - int depth; - int n; - struct gpu_array_bound *bound; - isl_multi_aff *tiling; -}; - -struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index); -struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile); - -__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile); - -#endif diff --git a/polly/lib/External/ppcg/gpu_array_tile.c b/polly/lib/External/ppcg/gpu_array_tile.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_array_tile.c +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include - -#include "gpu_array_tile.h" - -struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile) -{ - int j; - - if (!tile) - return NULL; - - for (j = 0; j < tile->n; ++j) { - isl_val_free(tile->bound[j].size); - isl_val_free(tile->bound[j].stride); - isl_aff_free(tile->bound[j].lb); - isl_aff_free(tile->bound[j].shift); - } - free(tile->bound); - isl_multi_aff_free(tile->tiling); - free(tile); - - return NULL; -} - -/* Create a gpu_array_tile for an array of dimension "n_index". - */ -struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index) -{ - int i; - struct gpu_array_tile *tile; - - tile = isl_calloc_type(ctx, struct gpu_array_tile); - if (!tile) - return NULL; - - tile->ctx = ctx; - tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index); - if (!tile->bound) - return gpu_array_tile_free(tile); - - tile->n = n_index; - - for (i = 0; i < n_index; ++i) { - tile->bound[i].size = NULL; - tile->bound[i].lb = NULL; - tile->bound[i].stride = NULL; - tile->bound[i].shift = NULL; - } - - return tile; -} - -/* Compute the size of the tile specified by "tile" - * in number of elements and return the result. - */ -__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile) -{ - int i; - isl_val *size; - - if (!tile) - return NULL; - - size = isl_val_one(tile->ctx); - - for (i = 0; i < tile->n; ++i) - size = isl_val_mul(size, isl_val_copy(tile->bound[i].size)); - - return size; -} diff --git a/polly/lib/External/ppcg/gpu_group.h b/polly/lib/External/ppcg/gpu_group.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_group.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef GPU_GROUP_H -#define GPU_GROUP_H - -#include -#include "gpu.h" - -/* A group of array references in a kernel that should be handled together. - * If private_tile is not NULL, then it is mapped to registers. - * Otherwise, if shared_tile is not NULL, it is mapped to shared memory. - * Otherwise, it is accessed from global memory. 
 - * Note that if both private_tile and shared_tile are set, then shared_tile
- * is only used inside group_common_shared_memory_tile.
- */
-struct gpu_array_ref_group {
-	/* The references in this group access this local array. */
-	struct gpu_local_array_info *local_array;
-	/* This is the corresponding array. */
-	struct gpu_array_info *array;
-	/* Position of this group in the list of reference groups of array. */
-	int nr;
-
-	/* The following fields are used during the construction of the groups.
-	 * access is the combined access relation relative to the private
-	 * memory tiling. In particular, the domain of the map corresponds
-	 * to the first thread_depth dimensions of the kernel schedule.
-	 * write is set if any access in the group is a write.
-	 * exact_write is set if all writes are definite writes.
-	 * slice is set if there is at least one access in the group
-	 * that refers to more than one element
-	 * "min_depth" is the minimum of the tile depths and thread_depth.
-	 */
-	isl_map *access;
-	int write;
-	int exact_write;
-	int slice;
-	int min_depth;
-
-	/* The shared memory tile, NULL if none. */
-	struct gpu_array_tile *shared_tile;
-
-	/* The private memory tile, NULL if none. */
-	struct gpu_array_tile *private_tile;
-
-	/* References in this group; point to elements of a linked list. */
-	int n_ref;
-	struct gpu_stmt_access **refs;
-};
-
-int gpu_group_references(struct ppcg_kernel *kernel,
-	__isl_keep isl_schedule_node *node);
-
-__isl_give isl_printer *gpu_array_ref_group_print_name(
-	struct gpu_array_ref_group *group, __isl_take isl_printer *p);
-void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group);
-__isl_give isl_union_map *gpu_array_ref_group_access_relation(
-	struct gpu_array_ref_group *group, int read, int write);
-int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group);
-enum ppcg_group_access_type gpu_array_ref_group_type(
-	struct gpu_array_ref_group *group);
-struct gpu_array_tile *gpu_array_ref_group_tile(
-	struct gpu_array_ref_group *group);
-struct gpu_array_ref_group *gpu_array_ref_group_free(
-	struct gpu_array_ref_group *group);
-
-#endif
diff --git a/polly/lib/External/ppcg/gpu_group.c b/polly/lib/External/ppcg/gpu_group.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/gpu_group.c
+++ /dev/null
@@ -1,1828 +0,0 @@
-/*
- * Copyright 2010-2011 INRIA Saclay
- * Copyright 2012-2014 Ecole Normale Superieure
- * Copyright 2015 Sven Verdoolaege
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
- * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
- * 91893 Orsay, France
- * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
- */
-
-#include
-#include
-
-#include "gpu_array_tile.h"
-#include "gpu_group.h"
-#include "gpu_tree.h"
-#include "schedule.h"
-
-/* Print the name of the local copy of a given group of array references.
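- *
- * For example, the second of several groups of array A that is mapped
- * to shared memory is printed as "shared_A_1".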
- */ -__isl_give isl_printer *gpu_array_ref_group_print_name( - struct gpu_array_ref_group *group, __isl_take isl_printer *p) -{ - int global = 0; - enum ppcg_group_access_type type; - - type = gpu_array_ref_group_type(group); - if (type == ppcg_access_private) - p = isl_printer_print_str(p, "private_"); - else if (type == ppcg_access_shared) - p = isl_printer_print_str(p, "shared_"); - else - global = 1; - p = isl_printer_print_str(p, group->array->name); - if (!global && group->local_array->n_group > 1) { - p = isl_printer_print_str(p, "_"); - p = isl_printer_print_int(p, group->nr); - } - - return p; -} - -/* Return the union of all read (read = 1) and/or write (write = 1) - * access relations in the group. - */ -__isl_give isl_union_map *gpu_array_ref_group_access_relation( - struct gpu_array_ref_group *group, int read, int write) -{ - int i; - isl_union_map *access; - - access = isl_union_map_empty(isl_map_get_space(group->access)); - for (i = 0; i < group->n_ref; ++i) { - isl_map *map_i; - - if (!((read && group->refs[i]->read) || - (write && group->refs[i]->write))) - continue; - map_i = isl_map_copy(group->refs[i]->access); - access = isl_union_map_union(access, - isl_union_map_from_map(map_i)); - } - - return access; -} - -/* Should this array reference group be mapped to private, shared or global - * memory? - * If we have computed both a private and a shared tile, then - * the tile with the smallest depth is used. If both have the same depth, - * then the private tile is used. - */ -enum ppcg_group_access_type gpu_array_ref_group_type( - struct gpu_array_ref_group *group) -{ - if (group->private_tile && group->shared_tile && - group->shared_tile->depth < group->private_tile->depth) - return ppcg_access_shared; - if (group->private_tile) - return ppcg_access_private; - if (group->shared_tile) - return ppcg_access_shared; - return ppcg_access_global; -} - - -/* Return the effective gpu_array_tile associated to "group" or - * NULL if there is no such gpu_array_tile. - */ -struct gpu_array_tile *gpu_array_ref_group_tile( - struct gpu_array_ref_group *group) -{ - switch (gpu_array_ref_group_type(group)) { - case ppcg_access_global: - return NULL; - case ppcg_access_shared: - return group->shared_tile; - case ppcg_access_private: - return group->private_tile; - } -} - -/* Does the tile associated to "group" require unrolling of the schedule - * dimensions mapped to threads? - * Note that this can only happen for private tiles. - */ -int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group) -{ - struct gpu_array_tile *tile; - - tile = gpu_array_ref_group_tile(group); - if (!tile) - return 0; - return tile->requires_unroll; -} - -/* Given a constraint - * - * a(p,i) + j = g f(e) - * - * or -a(p,i) - j = g f(e) if sign < 0, - * store a(p,i) in bound->shift and g (stride) in bound->stride. - * a(p,i) is assumed to be an expression in only the parameters - * and the input dimensions. 
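- *
- * For example, the equality 2 i0 + j = 4 e0 records the stride g = 4
- * and the shift a(p,i) = 2 i0.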
 - */
-static void extract_stride(__isl_keep isl_constraint *c,
-	struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign)
-{
-	int i;
-	isl_val *v;
-	isl_space *space;
-	unsigned nparam;
-	unsigned nvar;
-	isl_aff *aff;
-
-	isl_val_free(bound->stride);
-	bound->stride = isl_val_copy(stride);
-
-	space = isl_constraint_get_space(c);
-	space = isl_space_domain(space);
-
-	nparam = isl_space_dim(space, isl_dim_param);
-	nvar = isl_space_dim(space, isl_dim_set);
-
-	v = isl_constraint_get_constant_val(c);
-	if (sign < 0)
-		v = isl_val_neg(v);
-	aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
-	aff = isl_aff_set_constant_val(aff, v);
-
-	for (i = 0; i < nparam; ++i) {
-		if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1))
-			continue;
-		v = isl_constraint_get_coefficient_val(c, isl_dim_param, i);
-		if (sign < 0)
-			v = isl_val_neg(v);
-		aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v);
-	}
-
-	for (i = 0; i < nvar; ++i) {
-		if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1))
-			continue;
-		v = isl_constraint_get_coefficient_val(c, isl_dim_in, i);
-		if (sign < 0)
-			v = isl_val_neg(v);
-		aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v);
-	}
-
-	bound->shift = aff;
-}
-
-/* Given an equality constraint of a map with a single output dimension j,
- * check if the constraint is of the form
- *
- *	a(p,i) + j = g f(e)
- *
- * with a(p,i) an expression in the parameters and input dimensions
- * and f(e) an expression in the existentially quantified variables.
- * If so, and if g is larger than any such g from a previously considered
- * constraint, then call extract_stride to record the stride information
- * in bound.
- */
-static isl_stat check_stride_constraint(__isl_take isl_constraint *c,
-	void *user)
-{
-	int i;
-	isl_ctx *ctx;
-	isl_val *v;
-	unsigned n_div;
-	struct gpu_array_bound *bound = user;
-
-	ctx = isl_constraint_get_ctx(c);
-	n_div = isl_constraint_dim(c, isl_dim_div);
-	v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0);
-
-	if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) {
-		int s = isl_val_sgn(v);
-		isl_val *stride = isl_val_zero(ctx);
-
-		isl_val_free(v);
-		for (i = 0; i < n_div; ++i) {
-			v = isl_constraint_get_coefficient_val(c,
-							isl_dim_div, i);
-			stride = isl_val_gcd(stride, v);
-		}
-		if (!isl_val_is_zero(stride) &&
-		    isl_val_gt(stride, bound->stride))
-			extract_stride(c, bound, stride, s);
-
-		isl_val_free(stride);
-	} else
-		isl_val_free(v);
-
-	isl_constraint_free(c);
-	return isl_stat_ok;
-}
-
-/* Given constraints on an array index i, check if we can find
- * a shift a(p) and a stride g such that
- *
- *	a(p) + i = 0 mod g
- *
- * If so, record the information in bound and apply the mapping
- * i -> (i + a(p))/g to the array index in bounds and return
- * the new constraints.
- * If not, simply return the original constraints.
- *
- * If bounds is a subset of the space
- *
- *	D -> i
- *
- * then the bound recorded in bound->shift is of the form
- *
- *	D -> s(D)
- *
- * with s(D) equal to a(p) above.
- * Next, we construct a mapping of the form
- *
- *	[D -> i] -> [D -> (i + S(D))/g]
- *
- * This mapping is computed as follows.
- * We first introduce "i" in the domain through precomposition
- * with [D -> i] -> D obtaining
- *
- *	[D -> i] -> s(D)
- *
- * Adding [D -> i] -> i produces
- *
- *	[D -> i] -> i + s(D)
- *
- * and the domain product with [D -> i] -> D yields
- *
- *	[D -> i] -> [D -> i + s(D)]
- *
- * Composition with [D -> i] -> [D -> i/g] gives the desired result.
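- *
- * For example, for g = 2 and s(D) = 1, the pair [D -> 5] is mapped to
- * [D -> (5 + 1)/2] = [D -> 3].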
 - */
-static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound,
-	__isl_take isl_basic_map *bounds)
-{
-	isl_space *space;
-	isl_basic_map *hull;
-	isl_basic_map *shift, *id, *bmap, *scale;
-	isl_basic_set *bset;
-	isl_aff *aff;
-
-	bound->stride = NULL;
-
-	hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
-
-	isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound);
-
-	isl_basic_map_free(hull);
-
-	if (!bound->stride)
-		return bounds;
-
-	shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift));
-	space = isl_basic_map_get_space(bounds);
-	bmap = isl_basic_map_domain_map(isl_basic_map_universe(space));
-	shift = isl_basic_map_apply_range(bmap, shift);
-	space = isl_basic_map_get_space(bounds);
-	id = isl_basic_map_range_map(isl_basic_map_universe(space));
-	shift = isl_basic_map_sum(id, shift);
-	space = isl_basic_map_get_space(bounds);
-	id = isl_basic_map_domain_map(isl_basic_map_universe(space));
-	shift = isl_basic_map_range_product(id, shift);
-
-	space = isl_space_domain(isl_basic_map_get_space(bounds));
-	id = isl_basic_map_identity(isl_space_map_from_set(space));
-	space = isl_space_range(isl_basic_map_get_space(bounds));
-	aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
-	aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
-	aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride));
-	scale = isl_basic_map_from_aff(aff);
-	scale = isl_basic_map_product(id, scale);
-
-	bmap = isl_basic_map_apply_range(shift, scale);
-	bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap);
-	bounds = isl_basic_set_unwrap(bset);
-
-	return bounds;
-}
-
-/* Data used in compute_array_dim_size and compute_size_in_direction.
- *
- * pos is the position of the variable representing the array index,
- * i.e., the variable for which we want to compute the size. This variable
- * is also the last variable in the set.
- */
-struct gpu_size_info {
-	isl_basic_set *bset;
-	struct gpu_array_bound *bound;
-	int pos;
-};
-
-/* Given a constraint from the basic set describing the bounds on
- * an array index, check if it is a lower bound, say m i >= b(x), and,
- * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
- * upper bound. If so, and if this bound is smaller than any bound
- * derived from earlier constraints, set the size to this bound on
- * the expression and the lower bound to ceil(b(x)/m).
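- *
- * For example, for the constraints x <= i <= x + 3, the lower bound
- * i >= x yields lb = x and "i - x + 1" has constant upper bound 4,
- * so the size in this direction is 4.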
- */ -static isl_stat compute_size_in_direction(__isl_take isl_constraint *c, - void *user) -{ - struct gpu_size_info *size = user; - unsigned nparam; - unsigned n_div; - isl_val *v; - isl_aff *aff; - isl_aff *lb; - - nparam = isl_basic_set_dim(size->bset, isl_dim_param); - n_div = isl_constraint_dim(c, isl_dim_div); - - if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) || - !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) { - isl_constraint_free(c); - return isl_stat_ok; - } - - aff = isl_constraint_get_bound(c, isl_dim_set, size->pos); - aff = isl_aff_ceil(aff); - - lb = isl_aff_copy(aff); - - aff = isl_aff_neg(aff); - aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1); - - v = isl_basic_set_max_val(size->bset, aff); - isl_aff_free(aff); - - if (isl_val_is_int(v)) { - v = isl_val_add_ui(v, 1); - if (!size->bound->size || isl_val_lt(v, size->bound->size)) { - isl_val_free(size->bound->size); - size->bound->size = isl_val_copy(v); - lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1); - isl_aff_free(size->bound->lb); - size->bound->lb = isl_aff_copy(lb); - } - } - isl_val_free(v); - isl_aff_free(lb); - - isl_constraint_free(c); - - return isl_stat_ok; -} - -/* Given a basic map "bounds" that maps parameters and input dimensions - * to a single output dimension, look for an expression in the parameters - * and input dimensions such that the range of the output dimension shifted - * by this expression is a constant. - * - * In particular, we currently only consider lower bounds on the output - * dimension as candidate expressions. - */ -static int compute_array_dim_size(struct gpu_array_bound *bound, - __isl_take isl_basic_map *bounds) -{ - struct gpu_size_info size; - - bounds = isl_basic_map_detect_equalities(bounds); - bounds = check_stride(bound, bounds); - - bound->size = NULL; - bound->lb = NULL; - - size.bound = bound; - size.pos = isl_basic_map_dim(bounds, isl_dim_in); - size.bset = isl_basic_map_wrap(bounds); - size.bset = isl_basic_set_flatten(size.bset); - size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset)); - isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction, - &size); - isl_basic_set_free(size.bset); - - return bound->size ? 0 : -1; -} - -/* Check if we can find a memory tile for the given array - * based on the given accesses, and if so, put the results in "tile". - * - * We project the accesses on each index in turn and look for a parametric - * offset such that the size is constant. - * - * tile->depth is initialized to the input dimension of the computed bounds. - */ -static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile) -{ - int i; - - tile->depth = isl_map_dim(access, isl_dim_in); - - for (i = 0; i < tile->n; ++i) { - isl_map *access_i; - isl_basic_map *hull; - - access_i = isl_map_copy(access); - access_i = isl_map_project_out(access_i, isl_dim_out, 0, i); - access_i = isl_map_project_out(access_i, isl_dim_out, - 1, tile->n - (i + 1)); - access_i = isl_map_compute_divs(access_i); - hull = isl_map_simple_hull(access_i); - if (compute_array_dim_size(&tile->bound[i], hull) < 0) - return 0; - } - - return 1; -} - -/* Internal data structure for gpu_group_references. - * - * scop represents the input scop. - * kernel_depth is the schedule depth where the kernel launch will - * be introduced, i.e., it is the depth of the band that is mapped - * to blocks. - * shared_depth is the schedule depth at which the copying to/from - * shared memory is computed. 
The copy operation may then - * later be hoisted to a higher level. - * thread_depth is the schedule depth where the thread mark is located, - * i.e., it is the depth of the band that is mapped to threads and also - * the schedule depth at which the copying to/from private memory - * is computed. The copy operation may then later be hoisted to - * a higher level. - * n_thread is the number of schedule dimensions in the band that - * is mapped to threads. - * privatization lives in the range of thread_sched (i.e., it is - * of dimension thread_depth + n_thread) and encodes the mapping - * to thread identifiers (as parameters). - * host_sched contains the kernel_depth dimensions of the host schedule. - * shared_sched contains the first shared_depth dimensions of the - * kernel schedule. - * copy_sched contains the first thread_depth dimensions of the - * kernel schedule. - * thread_sched contains the first (thread_depth + n_thread) dimensions - * of the kernel schedule. - * full_sched is a union_map representation of the entire kernel schedule. - * The schedules are all formulated in terms of the original statement - * instances, i.e., those that appear in the domains of the access - * relations. - */ -struct gpu_group_data { - struct ppcg_scop *scop; - int kernel_depth; - int shared_depth; - int thread_depth; - int n_thread; - isl_set *privatization; - isl_union_map *host_sched; - isl_union_map *shared_sched; - isl_union_map *copy_sched; - isl_union_map *thread_sched; - isl_union_map *full_sched; -}; - -/* Construct a map from domain_space to domain_space that increments - * the dimension at position "pos" and leaves all other dimensions - * constant. - */ -static __isl_give isl_map *next(__isl_take isl_space *domain_space, int pos) -{ - isl_space *space; - isl_aff *aff; - isl_multi_aff *next; - - space = isl_space_map_from_set(domain_space); - next = isl_multi_aff_identity(space); - aff = isl_multi_aff_get_aff(next, pos); - aff = isl_aff_add_constant_si(aff, 1); - next = isl_multi_aff_set_aff(next, pos, aff); - - return isl_map_from_multi_aff(next); -} - -/* Check if the given access is coalesced (or if there is no point - * in trying to coalesce the access by mapping the array to shared memory). - * That is, check whether incrementing the dimension that will get - * wrapped over the last thread index results in incrementing - * the last array index. - * - * If no two consecutive array elements are ever accessed by "access", - * then mapping the corresponding array to shared memory will not - * improve coalescing. In fact, the copying will likely be performed - * by a single thread. Consider the access as coalesced such that - * the caller will not try and map the array to shared memory just - * to improve coalescing. - * - * This function is only called for access relations without reuse and - * kernels with at least one thread identifier. 
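- *
- * For example, an access A[t], with t the dimension that is wrapped
- * over the last thread index, is coalesced, while an access A[32 t]
- * is not, since incrementing t then skips 31 elements of A.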
- */ -static int access_is_coalesced(struct gpu_group_data *data, - __isl_keep isl_union_map *access) -{ - int dim; - isl_space *space; - isl_set *accessed; - isl_map *access_map; - isl_map *next_thread_x; - isl_map *next_element; - isl_map *map; - int coalesced, empty; - - access = isl_union_map_copy(access); - access = isl_union_map_apply_domain(access, - isl_union_map_copy(data->full_sched)); - access_map = isl_map_from_union_map(access); - - space = isl_map_get_space(access_map); - space = isl_space_range(space); - dim = isl_space_dim(space, isl_dim_set); - if (dim == 0) - next_element = isl_map_empty(isl_space_map_from_set(space)); - else - next_element = next(space, dim - 1); - - accessed = isl_map_range(isl_map_copy(access_map)); - map = isl_map_copy(next_element); - map = isl_map_intersect_domain(map, isl_set_copy(accessed)); - map = isl_map_intersect_range(map, accessed); - empty = isl_map_is_empty(map); - isl_map_free(map); - - if (empty < 0 || empty) { - isl_map_free(next_element); - isl_map_free(access_map); - return empty; - } - - space = isl_map_get_space(access_map); - space = isl_space_domain(space); - next_thread_x = next(space, data->thread_depth + data->n_thread - 1); - - map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map)); - map = isl_map_apply_range(map, access_map); - - coalesced = isl_map_is_subset(map, next_element); - - isl_map_free(next_element); - isl_map_free(map); - - return coalesced; -} - -/* Replace the host schedule dimensions in the access relation "access" - * by parameters, so that they are treated as fixed when checking for reuse - * (within a kernel) or whether two consecutive elements are accessed - * (within a kernel). - */ -static __isl_give isl_union_map *localize_access(struct gpu_group_data *data, - __isl_take isl_union_map *access) -{ - int n; - isl_space *space; - isl_set *param; - isl_union_map *umap; - isl_id_list *ids; - - umap = isl_union_map_copy(data->host_sched); - space = isl_union_map_get_space(umap); - n = data->kernel_depth; - ids = ppcg_scop_generate_names(data->scop, n, "__ppcg_host_"); - param = parametrization(space, n, 0, ids); - isl_id_list_free(ids); - umap = isl_union_map_intersect_range(umap, - isl_union_set_from_set(param)); - access = isl_union_map_intersect_domain(access, - isl_union_map_domain(umap)); - - return access; -} - -/* Given an access relation in terms of at least data->thread_depth initial - * dimensions of the computed schedule, check if it is bijective for - * fixed values of the first data->thread_depth dimensions. - * We perform this check by equating these dimensions to parameters. - */ -static int access_is_bijective(struct gpu_group_data *data, - __isl_keep isl_map *access) -{ - int res; - int dim; - isl_set *par; - isl_space *space; - isl_id_list *ids; - - access = isl_map_copy(access); - space = isl_space_params(isl_map_get_space(access)); - ids = ppcg_scop_generate_names(data->scop, data->thread_depth, "s"); - dim = isl_map_dim(access, isl_dim_in); - par = parametrization(space, dim, 0, ids); - isl_id_list_free(ids); - access = isl_map_intersect_domain(access, par); - res = isl_map_is_bijective(access); - isl_map_free(access); - - return res; -} - -/* Compute the number of outer schedule tile dimensions that affect - * the offset of "tile". - * If there is no such dimension, then return the index - * of the first kernel dimension, i.e., data->kernel_depth. 
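- *
- * For example, if the innermost schedule dimension occurring in any
- * lb or shift of the tile is dimension 1, then the result is 2.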
 - */
-static int compute_tile_depth(struct gpu_group_data *data,
-	struct gpu_array_tile *tile)
-{
-	int i, j;
-
-	for (j = tile->depth - 1; j >= data->kernel_depth; --j) {
-		for (i = 0; i < tile->n; ++i) {
-			isl_aff *lb;
-			isl_aff *shift;
-
-			lb = tile->bound[i].lb;
-			if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
-				break;
-
-			shift = tile->bound[i].shift;
-			if (!shift)
-				continue;
-			if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
-				break;
-		}
-		if (i < tile->n)
-			break;
-	}
-
-	return ++j;
-}
-
-/* Return the lowest depth between data->kernel_depth and data->thread_depth
- * at which every array element accessed through "acc" is accessed
- * by a single thread. The input dimension of "acc" is
- * data->thread_depth + data->n_thread, where the final data->n_thread
- * dimensions are those that will be mapped to threads.
- * If the values for these dimensions are uniquely determined
- * by the array index and a given number of outer dimensions, then
- * there is only one thread accessing that array element within those
- * outer dimensions.
- *
- * The input space of "acc" is first split up, such that it has the form
- *
- *	[O -> T] -> A
- *
- * with O the outer dimensions, T the dimensions that will be mapped to threads
- * and A the array index.
- *
- * Then the positions of T and A are interchanged to simplify the test
- * whether T uniquely depends on O and A.
- * In particular, the above access relation is first combined with
- *
- *	[O -> T] -> T
- *
- * to form
- *
- *	[O -> T] -> [A -> T]
- *
- * from which
- *
- *	O -> [A -> T]
- *
- * is extracted, which is then uncurried to
- *
- *	[O -> A] -> T
- *
- * Finally, the final dimensions of O are projected out one by one
- * until T is no longer uniquely determined by A and the remaining
- * dimensions in O. The value returned is that of the last dimension
- * that was successfully projected out.
- * Note that there is no need to test whether [O -> A] -> T itself
- * is single-valued as that was already tested in access_is_bijective.
- */
-static int compute_accessed_by_single_thread_depth(struct gpu_group_data *data,
-	__isl_keep isl_map *acc)
-{
-	int i;
-	isl_space *space;
-	isl_map *map;
-	isl_bool sv;
-
-	if (data->thread_depth == data->kernel_depth)
-		return data->thread_depth;
-
-	acc = isl_map_copy(acc);
-
-	space = isl_map_get_space(acc);
-	space = isl_space_params(space);
-	space = isl_space_set_from_params(space);
-	space = isl_space_add_dims(space, isl_dim_set, data->thread_depth);
-	space = isl_space_from_domain(space);
-	space = isl_space_add_dims(space, isl_dim_out, data->n_thread);
-	space = isl_space_wrap(space);
-	map = isl_set_flatten_map(isl_set_universe(space));
-	acc = isl_map_apply_range(map, acc);
-
-	space = isl_space_domain(isl_map_get_space(acc));
-	map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
-	acc = isl_map_range_product(acc, map);
-	acc = isl_map_domain_factor_domain(acc);
-	acc = isl_map_uncurry(acc);
-
-	for (i = data->thread_depth - 1; i >= data->kernel_depth; --i) {
-		acc = isl_map_project_out(acc, isl_dim_in, i, 1);
-		sv = isl_map_is_single_valued(acc);
-		if (sv < 0)
-			return -1;
-		if (!sv)
-			break;
-	}
-
-	isl_map_free(acc);
-
-	return ++i;
-}
-
-/* Adjust the fields of "tile" to reflect the new input dimension "depth".
- * The dimensions beyond "depth" are assumed not to affect the tile,
- * so they can simply be dropped.
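- *
- * For example, adjusting a tile of depth 4 to depth 2 drops input
- * dimensions 2 and 3 from each lb and shift expression.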
- */ -static int tile_adjust_depth(struct gpu_array_tile *tile, int depth) -{ - int i; - - if (tile->depth == depth) - return 0; - - for (i = 0; i < tile->n; ++i) { - tile->bound[i].lb = isl_aff_drop_dims(tile->bound[i].lb, - isl_dim_in, depth, tile->depth - depth); - if (!tile->bound[i].lb) - return -1; - if (!tile->bound[i].shift) - continue; - tile->bound[i].shift = isl_aff_drop_dims(tile->bound[i].shift, - isl_dim_in, depth, tile->depth - depth); - if (!tile->bound[i].shift) - return -1; - } - - tile->depth = depth; - - return 0; -} - -/* Determine the number of schedule dimensions that affect the offset of the - * shared or private tile "tile" and store the result in tile->depth, with - * a lower bound of data->kernel_depth. - * Also adjust the fields of the tile to only refer to the tile->depth - * outer schedule dimensions. - */ -static isl_stat tile_set_depth(struct gpu_group_data *data, - struct gpu_array_tile *tile) -{ - if (tile_adjust_depth(tile, compute_tile_depth(data, tile)) < 0) - return isl_stat_error; - - return isl_stat_ok; -} - -/* Determine the number of schedule dimensions that affect the offset of the - * shared tile and store the minimum of the private and shared tile depth - * in group->min_depth, with a lower bound of data->kernel_depth. - * If there is no tile defined on the array reference group, - * then set group->min_depth to data->thread_depth. - */ -static int set_depth(struct gpu_group_data *data, - struct gpu_array_ref_group *group) -{ - group->min_depth = data->thread_depth; - - if (group->private_tile) { - if (group->private_tile->depth < group->min_depth) - group->min_depth = group->private_tile->depth; - } - if (group->shared_tile) { - if (tile_set_depth(data, group->shared_tile) < 0) - return -1; - if (group->shared_tile->depth < group->min_depth) - group->min_depth = group->shared_tile->depth; - } - - return 0; -} - -/* Fill up the groups array with singleton groups, i.e., one group - * per reference, initializing the array, access, write, n_ref and refs fields. - * In particular the access field is initialized to the scheduled - * access relation of the array reference. - * - * Return the number of elements initialized, i.e., the number of - * active references in the current kernel. 
- */ -static int populate_array_references(struct gpu_local_array_info *local, - struct gpu_array_ref_group **groups, struct gpu_group_data *data) -{ - int i; - int n; - isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched); - - n = 0; - for (i = 0; i < local->array->n_ref; ++i) { - isl_union_map *umap; - isl_map *map; - struct gpu_array_ref_group *group; - struct gpu_stmt_access *access = local->array->refs[i]; - - map = isl_map_copy(access->access); - umap = isl_union_map_from_map(map); - umap = isl_union_map_apply_domain(umap, - isl_union_map_copy(data->copy_sched)); - - if (isl_union_map_is_empty(umap)) { - isl_union_map_free(umap); - continue; - } - - map = isl_map_from_union_map(umap); - map = isl_map_detect_equalities(map); - - group = isl_calloc_type(ctx, struct gpu_array_ref_group); - if (!group) - return -1; - group->local_array = local; - group->array = local->array; - group->access = map; - group->write = access->write; - group->exact_write = access->exact_write; - group->slice = access->n_index < local->array->n_index; - group->refs = &local->array->refs[i]; - group->n_ref = 1; - - groups[n++] = group; - } - - return n; -} - -/* If group->n_ref == 1, then group->refs was set by - * populate_array_references to point directly into - * group->array->refs and should not be freed. - * If group->n_ref > 1, then group->refs was set by join_groups - * to point to a newly allocated array. - */ -struct gpu_array_ref_group *gpu_array_ref_group_free( - struct gpu_array_ref_group *group) -{ - if (!group) - return NULL; - gpu_array_tile_free(group->shared_tile); - gpu_array_tile_free(group->private_tile); - isl_map_free(group->access); - if (group->n_ref > 1) - free(group->refs); - free(group); - return NULL; -} - -/* Check if the access relations of group1 and group2 overlap within - * copy_sched. - */ -static int accesses_overlap(struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2) -{ - int disjoint; - - disjoint = isl_map_is_disjoint(group1->access, group2->access); - if (disjoint < 0) - return -1; - - return !disjoint; -} - -/* Combine the given two groups into a single group, containing - * the references of both groups. - */ -static struct gpu_array_ref_group *join_groups( - struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2) -{ - int i; - isl_ctx *ctx; - struct gpu_array_ref_group *group; - - if (!group1 || !group2) - return NULL; - - ctx = isl_map_get_ctx(group1->access); - group = isl_calloc_type(ctx, struct gpu_array_ref_group); - if (!group) - return NULL; - group->local_array = group1->local_array; - group->array = group1->array; - group->access = isl_map_union(isl_map_copy(group1->access), - isl_map_copy(group2->access)); - group->write = group1->write || group2->write; - group->exact_write = group1->exact_write && group2->exact_write; - group->slice = group1->slice || group2->slice; - group->n_ref = group1->n_ref + group2->n_ref; - group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *, - group->n_ref); - if (!group->refs) - return gpu_array_ref_group_free(group); - for (i = 0; i < group1->n_ref; ++i) - group->refs[i] = group1->refs[i]; - for (i = 0; i < group2->n_ref; ++i) - group->refs[group1->n_ref + i] = group2->refs[i]; - - return group; -} - -/* Combine the given two groups into a single group and free - * the original two groups. 
- */ -static struct gpu_array_ref_group *join_groups_and_free( - struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2) -{ - struct gpu_array_ref_group *group; - - group = join_groups(group1, group2); - gpu_array_ref_group_free(group1); - gpu_array_ref_group_free(group2); - return group; -} - -/* Report that the array reference group with the given access relation - * is not mapped to shared memory in the given kernel because - * it does not exhibit any reuse and is considered to be coalesced. - */ -static void report_no_reuse_and_coalesced(struct ppcg_kernel *kernel, - __isl_keep isl_union_map *access) -{ - isl_ctx *ctx; - isl_printer *p; - - ctx = isl_union_map_get_ctx(access); - p = isl_printer_to_file(ctx, stdout); - p = isl_printer_print_str(p, "Array reference group "); - p = isl_printer_print_union_map(p, access); - p = isl_printer_print_str(p, - " not considered for mapping to shared memory in kernel"); - p = isl_printer_print_int(p, kernel->id); - p = isl_printer_print_str(p, - " because it exhibits no reuse and is considered to be coalesced"); - p = isl_printer_end_line(p); - isl_printer_free(p); -} - -/* Given an access relation in terms of the data->thread_depth initial - * dimensions of the computed schedule and the thread identifiers - * (as parameters), check if the use of the corresponding private tile - * requires unrolling. - * - * If we are creating a private tile because we are forced to, - * then no unrolling is required. - * Otherwise we check if "access" is bijective and unrolling - * is required if it is not. Note that the access relation - * has already been determined to be bijective before the introduction - * of the thread identifiers and the removal of the schedule dimensions - * that are mapped to these threads. If the access relation is no longer - * bijective, then this means that more than one value of one of those - * schedule dimensions is mapped to the same thread and therefore - * unrolling is required. - */ -static int check_requires_unroll(struct gpu_group_data *data, - __isl_keep isl_map *access, int force_private) -{ - int bijective; - - if (force_private) - return 0; - bijective = access_is_bijective(data, access); - if (bijective < 0) - return -1; - return !bijective; -} - -/* Map the domain of "access" to the outer data->shared_depth - * schedule dimensions. When data->shared_depth is equal to - * data->thread_depth, this result is already available in group->access. - */ -static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group, - __isl_keep isl_union_map *access, struct gpu_group_data *data) -{ - isl_union_map *shared; - - if (data->shared_depth == data->thread_depth) - return isl_map_copy(group->access); - - shared = isl_union_map_copy(access); - shared = isl_union_map_apply_domain(shared, - isl_union_map_copy(data->shared_sched)); - return isl_map_from_union_map(shared); -} - -/* Compute the private and/or shared memory tiles for the array - * reference group "group" of array "array". - * Return 0 on success and -1 on error. - * - * If the array is a read-only scalar or if the user requested - * not to use shared or private memory, then we do not need to do anything. - * - * If any reference in the reference group accesses more than one element, - * then we would have to make sure that the layout in shared memory - * is the same as that in global memory. Since we do not handle this yet - * (and it may not even be possible), we refuse to map to private or - * shared memory in such cases. 
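- * A reference that accesses an entire row of a two-dimensional array
- * is an example of such a reference.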
- * - * If the array group involves any may writes (that are not must writes), - * then we would have to make sure that we load the data into shared/private - * memory first in case the data is not written by the kernel - * (but still written back out to global memory). - * Since we don't have any such mechanism at the moment, we don't - * compute shared/private tiles for groups involving may writes. - * - * We only try to compute a shared memory tile if there is any reuse - * or if the access is not coalesced. - * Reuse and coalescing are checked within the given kernel. - * - * For computing a private memory tile, we also require that there is - * some reuse. Moreover, we require that the access is private - * to the thread. That is, we check that any given array element - * is only accessed by a single thread. - * We compute an access relation that maps the outer - * data->thread_depth + data->n_thread schedule dimensions. - * The latter data->n_thread will be mapped to thread identifiers. - * We actually check that those iterators that will be wrapped - * partition the array space. This check is stricter than necessary - * since several iterations may be mapped onto the same thread - * and then they could be allowed to access the same memory elements, - * but our check does not allow this situation. - * - * For private memory tiles, the number of schedule dimensions that - * affect the offset is computed and stored in tile->depth, with - * a lower bound of data->kernel_depth. If this depth is smaller - * than the minimal depth that still ensures that every element - * is accessed by a single thread, then the depth is raised - * to this minimal depth. - * The fields of the tile are then adjusted to only refer to the tile->depth - * outer schedule dimensions. - * - * We also check that the index expression only depends on parallel - * loops. That way, we can move those loops innermost and unroll them. - * Again, we use a test that is stricter than necessary. - * We actually check whether the index expression only depends - * on the iterators that are wrapped over the threads. - * These are necessarily parallel, but there may be more parallel loops. - * - * Combining the injectivity of the first test with the single-valuedness - * of the second test, we simply test for bijectivity. - * - * If the use of the private tile requires unrolling, but some - * of the other arrays are forcibly mapped to private memory, - * then we do not allow the use of this private tile since - * we cannot move the schedule dimensions that need to be unrolled down - * without performing some kind of expansion on those arrays - * that are forcibly mapped to private memory. - * - * If the array is marked force_private, then we bypass all checks - * and assume we can (and should) use registers only. - * - * If it turns out we can (or have to) use registers, we compute - * the private memory tile size using can_tile, after introducing a dependence - * on the thread indices. 
- */ -static int compute_group_bounds_core(struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group, struct gpu_group_data *data) -{ - isl_ctx *ctx = isl_space_get_ctx(group->array->space); - isl_union_map *access, *local; - int n_index = group->array->n_index; - int no_reuse, coalesced; - isl_map *acc; - int force_private = group->local_array->force_private; - int use_shared = !force_private && kernel->options->use_shared_memory && - data->n_thread > 0; - int use_private = force_private || kernel->options->use_private_memory; - int r = 0; - int requires_unroll; - int unique_depth; - - if (!use_shared && !use_private) - return 0; - if (gpu_array_is_read_only_scalar(group->array)) - return 0; - if (!force_private && !group->exact_write) - return 0; - if (group->slice) - return 0; - - access = gpu_array_ref_group_access_relation(group, 1, 1); - local = localize_access(data, isl_union_map_copy(access)); - no_reuse = isl_union_map_is_injective(local); - if (no_reuse < 0) - r = -1; - if (use_shared && no_reuse) - coalesced = access_is_coalesced(data, local); - isl_union_map_free(local); - - if (r >= 0 && kernel->options->debug->verbose && - use_shared && no_reuse && coalesced) - report_no_reuse_and_coalesced(kernel, access); - - if (use_shared && (!no_reuse || !coalesced)) { - group->shared_tile = gpu_array_tile_create(ctx, - group->array->n_index); - acc = shared_access(group, access, data); - if (!group->shared_tile) - r = -1; - else if (!can_tile(acc, group->shared_tile)) - group->shared_tile = - gpu_array_tile_free(group->shared_tile); - isl_map_free(acc); - } - - if (r < 0 || (!force_private && (!use_private || no_reuse))) { - isl_union_map_free(access); - return r; - } - - access = isl_union_map_apply_domain(access, - isl_union_map_copy(data->thread_sched)); - - acc = isl_map_from_union_map(access); - - if (!force_private && !access_is_bijective(data, acc)) { - isl_map_free(acc); - return 0; - } - - unique_depth = compute_accessed_by_single_thread_depth(data, acc); - - acc = isl_map_intersect_domain(acc, isl_set_copy(data->privatization)); - acc = isl_map_project_out(acc, isl_dim_in, data->thread_depth, - data->n_thread); - requires_unroll = check_requires_unroll(data, acc, force_private); - if (unique_depth < 0 || requires_unroll < 0 || - (requires_unroll && kernel->any_force_private)) { - isl_map_free(acc); - return requires_unroll < 0 ? -1 : 0; - } - - group->private_tile = gpu_array_tile_create(ctx, n_index); - if (!group->private_tile) { - isl_map_free(acc); - return -1; - } - group->private_tile->requires_unroll = requires_unroll; - if (!can_tile(acc, group->private_tile)) - group->private_tile = gpu_array_tile_free(group->private_tile); - - isl_map_free(acc); - - if (group->private_tile) { - struct gpu_array_tile *tile = group->private_tile; - int tile_depth = compute_tile_depth(data, tile); - if (tile_depth < unique_depth) - tile_depth = unique_depth; - if (tile_adjust_depth(tile, tile_depth) < 0) - return -1; - } - - if (force_private && !group->private_tile) - isl_die(ctx, isl_error_internal, - "unable to map array reference group to registers", - return -1); - - return 0; -} - -/* Compute the private and/or shared memory tiles for the array - * reference group "group" of array "array" and set the tile depth. - * Return 0 on success and -1 on error. 
- */ -static int compute_group_bounds(struct ppcg_kernel *kernel, - struct gpu_array_ref_group *group, struct gpu_group_data *data) -{ - if (!group) - return -1; - if (compute_group_bounds_core(kernel, group, data) < 0) - return -1; - if (set_depth(data, group) < 0) - return -1; - - return 0; -} - -/* If two groups have overlapping access relations (as determined by - * the "overlap" function) and if one of them involves a write, - * then merge the two groups into one. - * If "compute_bounds" is set, then call compute_group_bounds - * on the merged groups. - * - * Return the updated number of groups. - * Return -1 on error. - */ -static int group_writes(struct ppcg_kernel *kernel, - int n, struct gpu_array_ref_group **groups, - int (*overlap)(struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2), int compute_bounds, - struct gpu_group_data *data) -{ - int i, j; - - for (i = 0; i < n; ++i) { - for (j = n - 1; j > i; --j) { - if (!groups[i]->write && !groups[j]->write) - continue; - - if (!overlap(groups[i], groups[j])) - continue; - - groups[i] = join_groups_and_free(groups[i], groups[j]); - if (j != n - 1) - groups[j] = groups[n - 1]; - groups[n - 1] = NULL; - n--; - - if (!groups[i]) - return -1; - if (compute_bounds && - compute_group_bounds(kernel, groups[i], data) < 0) - return -1; - } - } - - return n; -} - -/* If two groups have overlapping access relations (within the innermost - * loop) and if one of them involves a write, then merge the two groups - * into one. - * - * Return the updated number of groups. - */ -static int group_overlapping_writes(struct ppcg_kernel *kernel, - int n, struct gpu_array_ref_group **groups, - struct gpu_group_data *data) -{ - return group_writes(kernel, n, groups, &accesses_overlap, 0, data); -} - -/* Check if the access relations of group1 and group2 overlap within - * the outermost min(group1->min_depth, group2->min_depth) loops. - */ -static int depth_accesses_overlap(struct gpu_array_ref_group *group1, - struct gpu_array_ref_group *group2) -{ - int depth; - int dim; - int empty; - isl_map *map_i, *map_j, *map; - - depth = group1->min_depth; - if (group2->min_depth < depth) - depth = group2->min_depth; - map_i = isl_map_copy(group1->access); - dim = isl_map_dim(map_i, isl_dim_in); - map_i = isl_map_eliminate(map_i, isl_dim_in, depth, dim - depth); - map_j = isl_map_copy(group2->access); - map_j = isl_map_eliminate(map_j, isl_dim_in, depth, dim - depth); - map = isl_map_intersect(map_i, map_j); - empty = isl_map_is_empty(map); - isl_map_free(map); - - return !empty; -} - -/* If two groups have overlapping access relations (within the outer - * depth loops) and if one of them involves a write, - * then merge the two groups into one. - * - * Return the updated number of groups. - */ -static int group_depth_overlapping_writes(struct ppcg_kernel *kernel, - int n, struct gpu_array_ref_group **groups, struct gpu_group_data *data) -{ - return group_writes(kernel, n, groups, &depth_accesses_overlap, 1, - data); -} - -/* Is the size of the tile specified by "tile" smaller than the sum of - * the sizes of the tiles specified by "tile1" and "tile2"? 
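- *
- * For example, a combined tile of 95 elements is smaller than separate
- * tiles of 64 and 32 elements.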
 - */
-static int smaller_tile(struct gpu_array_tile *tile,
-	struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
-{
-	int smaller;
-	isl_val *size, *size1, *size2;
-
-	size = gpu_array_tile_size(tile);
-	size1 = gpu_array_tile_size(tile1);
-	size2 = gpu_array_tile_size(tile2);
-
-	size = isl_val_sub(size, size1);
-	size = isl_val_sub(size, size2);
-	smaller = isl_val_is_neg(size);
-
-	isl_val_free(size);
-
-	return smaller;
-}
-
-/* Given an initial grouping of array references and shared memory tiles
- * for each group that allows for a shared memory tile, merge two groups
- * if both have a shared memory tile, the merged group also has
- * a shared memory tile and the size of the tile for the merged group
- * is smaller than the sum of the tile sizes of the individual groups.
- *
- * If merging two groups decreases the depth of the tile of
- * one or both of the two groups, then we need to check for overlapping
- * writes again.
- *
- * Return the number of groups after merging.
- * Return -1 on error.
- */
-static int group_common_shared_memory_tile(struct ppcg_kernel *kernel,
-	struct gpu_array_info *array, int n,
-	struct gpu_array_ref_group **groups, struct gpu_group_data *data)
-{
-	int i, j;
-	int recompute_overlap = 0;
-
-	for (i = 0; i < n; ++i) {
-		if (!groups[i]->shared_tile)
-			continue;
-		for (j = n - 1; j > i; --j) {
-			struct gpu_array_ref_group *group;
-
-			if (!groups[j]->shared_tile)
-				continue;
-
-			if (!depth_accesses_overlap(groups[i], groups[j]))
-				continue;
-
-			group = join_groups(groups[i], groups[j]);
-			if (compute_group_bounds(kernel, group, data) < 0) {
-				gpu_array_ref_group_free(group);
-				return -1;
-			}
-			if (!group->shared_tile ||
-			    !smaller_tile(group->shared_tile,
-					groups[i]->shared_tile,
-					groups[j]->shared_tile)) {
-				gpu_array_ref_group_free(group);
-				continue;
-			}
-
-			if (group->min_depth < groups[i]->min_depth ||
-			    group->min_depth < groups[j]->min_depth)
-				recompute_overlap = 1;
-			gpu_array_ref_group_free(groups[i]);
-			gpu_array_ref_group_free(groups[j]);
-			groups[i] = group;
-			if (j != n - 1)
-				groups[j] = groups[n - 1];
-			n--;
-		}
-	}
-
-	if (recompute_overlap)
-		n = group_depth_overlapping_writes(kernel, n, groups, data);
-	return n;
-}
-
-/* Set array->n_group and array->groups to n and groups.
- *
- * Additionally, set the "nr" field of each group.
- */
-static void set_array_groups(struct gpu_local_array_info *array,
-	int n, struct gpu_array_ref_group **groups)
-{
-	int i;
-
-	array->n_group = n;
-	array->groups = groups;
-
-	for (i = 0; i < n; ++i)
-		groups[i]->nr = i;
-}
-
-/* Combine all groups in "groups" into a single group and return
- * the new number of groups (1 or 0 if there were no groups to start with).
- */
-static int join_all_groups(int n, struct gpu_array_ref_group **groups)
-{
-	int i;
-
-	for (i = n - 1; i > 0; --i) {
-		groups[0] = join_groups_and_free(groups[0], groups[i]);
-		groups[i] = NULL;
-		n--;
-	}
-
-	return n;
-}
-
-/* Group array references that should be considered together when
- * deciding whether to access them from private, shared or global memory.
- * Return -1 on error.
- *
- * In particular, if two array references overlap and if one of them
- * is a write, then the two references are grouped together.
- * We first perform an initial grouping based only on the access relation.
- * After computing shared and private memory tiles, we check for
- * overlapping writes again, but this time taking into account
- * the depth of the effective tile.
 - *
- * Furthermore, if two groups admit a shared memory tile and if the
- * combination of the two also admits a shared memory tile, we merge
- * the two groups.
- *
- * If the array contains structures, then we compute a single
- * reference group without trying to find any tiles
- * since we do not map such arrays to private or shared
- * memory. The only exception is when those arrays of structures
- * are required to be mapped to private memory.
- */
-static int group_array_references(struct ppcg_kernel *kernel,
-	struct gpu_local_array_info *local, struct gpu_group_data *data)
-{
-	int i;
-	int n;
-	isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched);
-	struct gpu_array_ref_group **groups;
-
-	groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
-			local->array->n_ref);
-	if (!groups)
-		return -1;
-
-	n = populate_array_references(local, groups, data);
-
-	if (local->array->has_compound_element && !local->force_private) {
-		n = join_all_groups(n, groups);
-		set_array_groups(local, n, groups);
-		return 0;
-	}
-
-	n = group_overlapping_writes(kernel, n, groups, data);
-
-	for (i = 0; i < n; ++i)
-		if (compute_group_bounds(kernel, groups[i], data) < 0)
-			n = -1;
-
-	n = group_depth_overlapping_writes(kernel, n, groups, data);
-
-	n = group_common_shared_memory_tile(kernel, local->array,
-			n, groups, data);
-
-	set_array_groups(local, n, groups);
-
-	if (n >= 0)
-		return 0;
-
-	for (i = 0; i < local->array->n_ref; ++i)
-		gpu_array_ref_group_free(groups[i]);
-	return -1;
-}
-
-/* For each array in the input program that can be mapped to private memory,
- * check if there are any order dependences active inside the current kernel,
- * within the same iteration of the host schedule, i.e., the prefix
- * schedule at "node".
- * If so, mark the array as force_private so that its reference groups will be
- * mapped to registers.
- *
- * Note that the arrays that cannot be mapped to private memory have
- * had their order dependences added to prog->array_order and
- * subsequently to the coincidence constraints.
- */ -static void check_can_be_private_live_ranges(struct ppcg_kernel *kernel, - __isl_keep isl_schedule_node *node) -{ - int i; - isl_union_set *domain; - isl_multi_union_pw_aff *prefix; - isl_union_pw_multi_aff *contraction; - - if (!kernel->options->live_range_reordering) - return; - - kernel->any_force_private = 0; - - prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); - contraction = isl_union_pw_multi_aff_copy(kernel->contraction); - prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix, - contraction); - domain = isl_union_set_copy(kernel->expanded_domain); - domain = isl_union_set_universe(domain); - - for (i = 0; i < kernel->n_array; ++i) { - struct gpu_local_array_info *local = &kernel->array[i]; - isl_union_map *order; - - local->force_private = 0; - if (!gpu_array_can_be_private(local->array)) - continue; - order = isl_union_map_copy(local->array->dep_order); - order = isl_union_map_intersect_domain(order, - isl_union_set_copy(domain)); - order = isl_union_map_intersect_range(order, - isl_union_set_copy(domain)); - order = isl_union_map_eq_at_multi_union_pw_aff(order, - isl_multi_union_pw_aff_copy(prefix)); - if (!isl_union_map_is_empty(order)) { - local->force_private = 1; - kernel->any_force_private = 1; - } - isl_union_map_free(order); - } - - isl_multi_union_pw_aff_free(prefix); - isl_union_set_free(domain); -} - -/* Expand the domain of the schedule "s" by plugging in - * the contraction "contraction" and return the result. - */ -static __isl_give isl_union_map *expand(__isl_take isl_union_map *s, - __isl_keep isl_union_pw_multi_aff *contraction) -{ - contraction = isl_union_pw_multi_aff_copy(contraction); - s = isl_union_map_preimage_domain_union_pw_multi_aff(s, contraction); - return s; -} - -/* Create a set of dimension data->thread_depth + data->n_thread - * that equates the residue of the final data->n_thread dimensions - * modulo the kernel->block_dim sizes to the thread identifiers. - * Store the computed set in data->privatization. - * - * The construction starts with the space of kernel->thread_filter, - * which is known to reference all thread identifiers. - */ -static void compute_privatization(struct gpu_group_data *data, - struct ppcg_kernel *kernel) -{ - int i; - isl_ctx *ctx; - isl_space *space; - isl_local_space *ls; - isl_set *set; - - ctx = isl_union_map_get_ctx(data->shared_sched); - space = isl_union_set_get_space(kernel->thread_filter); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, - data->thread_depth + data->n_thread); - set = isl_set_universe(space); - space = isl_set_get_space(set); - ls = isl_local_space_from_space(space); - - for (i = 0; i < data->n_thread; ++i) { - isl_aff *aff, *aff2; - isl_constraint *c; - isl_val *v; - isl_id *id; - int pos; - - aff = isl_aff_var_on_domain(isl_local_space_copy(ls), - isl_dim_set, data->thread_depth + i); - v = isl_val_int_from_si(ctx, kernel->block_dim[i]); - aff = isl_aff_mod_val(aff, v); - id = isl_id_list_get_id(kernel->thread_ids, i); - pos = isl_set_find_dim_by_id(set, isl_dim_param, id); - isl_id_free(id); - aff2 = isl_aff_var_on_domain(isl_local_space_copy(ls), - isl_dim_param, pos); - aff = isl_aff_sub(aff, aff2); - c = isl_equality_from_aff(aff); - set = isl_set_add_constraint(set, c); - } - - isl_local_space_free(ls); - data->privatization = set; -} - -/* Return the prefix schedule at "node" as a relation - * between domain elements and schedule dimensions after detecting - * equalities in this relation. 
 - */
-static __isl_give isl_union_map *prefix_with_equalities(
-	__isl_keep isl_schedule_node *node)
-{
-	isl_union_map *schedule;
-
-	schedule = isl_schedule_node_get_prefix_schedule_relation(node);
-	schedule = isl_union_map_detect_equalities(schedule);
-
-	return schedule;
-}
-
-/* Group references of all arrays in "kernel".
- * "node" points to the kernel mark.
- * The mapping to shared memory is computed at the "shared" mark.
- *
- * We first extract all required schedule information into
- * a gpu_group_data structure and then consider each array
- * in turn.
- */
-int gpu_group_references(struct ppcg_kernel *kernel,
-	__isl_keep isl_schedule_node *node)
-{
-	int i;
-	int r = 0;
-	isl_union_pw_multi_aff *contraction;
-	struct gpu_group_data data;
-
-	check_can_be_private_live_ranges(kernel, node);
-
-	data.scop = kernel->prog->scop;
-
-	data.kernel_depth = isl_schedule_node_get_schedule_depth(node);
-	data.host_sched = isl_schedule_node_get_prefix_schedule_relation(node);
-
-	node = isl_schedule_node_copy(node);
-	node = gpu_tree_move_down_to_shared(node, kernel->core);
-	data.shared_depth = isl_schedule_node_get_schedule_depth(node);
-	data.shared_sched = prefix_with_equalities(node);
-
-	node = gpu_tree_move_down_to_thread(node, kernel->core);
-	node = isl_schedule_node_child(node, 0);
-	data.thread_depth = isl_schedule_node_get_schedule_depth(node);
-	data.n_thread = isl_schedule_node_band_n_member(node);
-	if (data.thread_depth == data.shared_depth)
-		data.copy_sched = isl_union_map_copy(data.shared_sched);
-	else
-		data.copy_sched = prefix_with_equalities(node);
-	data.thread_sched = isl_union_map_copy(data.copy_sched);
-	data.thread_sched = isl_union_map_flat_range_product(data.thread_sched,
-		isl_schedule_node_band_get_partial_schedule_union_map(node));
-	data.thread_sched = isl_union_map_detect_equalities(data.thread_sched);
-
-	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
-	data.host_sched = expand(data.host_sched, contraction);
-	data.shared_sched = expand(data.shared_sched, contraction);
-	if (data.thread_depth == data.shared_depth) {
-		isl_union_map_free(data.copy_sched);
-		data.copy_sched = isl_union_map_copy(data.shared_sched);
-	} else {
-		data.copy_sched = expand(data.copy_sched, contraction);
-	}
-	data.thread_sched = expand(data.thread_sched, contraction);
-	isl_union_pw_multi_aff_free(contraction);
-
-	node = isl_schedule_node_child(node, 0);
-	data.full_sched = isl_union_map_copy(data.thread_sched);
-	data.full_sched = isl_union_map_flat_range_product(data.full_sched,
-		isl_schedule_node_get_subtree_schedule_union_map(node));
-	isl_schedule_node_free(node);
-
-	compute_privatization(&data, kernel);
-
-	for (i = 0; i < kernel->n_array; ++i) {
-		r = group_array_references(kernel, &kernel->array[i], &data);
-		if (r < 0)
-			break;
-	}
-
-	isl_union_map_free(data.host_sched);
-	isl_union_map_free(data.shared_sched);
-	isl_union_map_free(data.copy_sched);
-	isl_union_map_free(data.thread_sched);
-	isl_union_map_free(data.full_sched);
-	isl_set_free(data.privatization);
-
-	return r;
-}
-
-/* Given a description of an array tile "tile" and the "space"
- *
- *	{ D -> A }
- *
- * where D represents the first tile->depth schedule dimensions
- * and A represents the array, construct an isl_multi_aff
- *
- *	{ [D[i] -> A[a]] -> A'[a'] }
- *
- * with A' a scaled down copy of A according to the shifts and strides
- * in "tile".
In particular, - * - * a' = (a + shift(i))/stride - * - * "insert_array" represents - * - * { [D -> A] -> D } - * - * and is used to insert A into the domain of functions that only - * reference D. - */ -static __isl_give isl_multi_aff *strided_tile( - struct gpu_array_tile *tile, __isl_keep isl_space *space, - __isl_keep isl_multi_aff *insert_array) -{ - int i; - isl_ctx *ctx; - isl_multi_aff *shift; - isl_multi_val *stride; - isl_space *space2; - isl_local_space *ls; - isl_multi_aff *tiling; - - ctx = isl_space_get_ctx(space); - space2 = isl_space_domain(isl_space_copy(space)); - ls = isl_local_space_from_space(space2); - space2 = isl_space_range(isl_space_copy(space)); - stride = isl_multi_val_zero(space2); - shift = isl_multi_aff_zero(isl_space_copy(space)); - - for (i = 0; i < tile->n; ++i) { - struct gpu_array_bound *bound = &tile->bound[i]; - isl_val *stride_i; - isl_aff *shift_i; - - if (tile->bound[i].shift) { - stride_i = isl_val_copy(bound->stride); - shift_i = isl_aff_copy(bound->shift); - } else { - stride_i = isl_val_one(ctx); - shift_i = isl_aff_zero_on_domain( - isl_local_space_copy(ls)); - } - - stride = isl_multi_val_set_val(stride, i, stride_i); - shift = isl_multi_aff_set_aff(shift, i, shift_i); - } - isl_local_space_free(ls); - - shift = isl_multi_aff_pullback_multi_aff(shift, - isl_multi_aff_copy(insert_array)); - - tiling = isl_multi_aff_range_map(isl_space_copy(space)); - tiling = isl_multi_aff_add(tiling, shift); - tiling = isl_multi_aff_scale_down_multi_val(tiling, stride); - - return tiling; -} - -/* Compute a tiling for the array reference group "group". - * - * The tiling is of the form - * - * { [D[i] -> A[a]] -> T[t] } - * - * where D represents the first tile->depth schedule dimensions, - * A represents the global array and T represents the shared or - * private memory tile. The name of T is the name of the local - * array. 
- * - * If there is any stride in the accesses, then the mapping is - * - * t = (a + shift(i))/stride - lb(i) - * - * otherwise, it is simply - * - * t = a - lb(i) - */ -void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group) -{ - int i; - struct gpu_array_tile *tile; - isl_space *space; - isl_multi_aff *tiling, *lb, *insert_array; - isl_printer *p; - char *local_name; - - tile = gpu_array_ref_group_tile(group); - if (!tile) - return; - - space = isl_map_get_space(group->access); - space = isl_space_from_range(isl_space_range(space)); - space = isl_space_add_dims(space, isl_dim_in, tile->depth); - insert_array = isl_multi_aff_domain_map(isl_space_copy(space)); - - for (i = 0; i < tile->n; ++i) - if (tile->bound[i].shift) - break; - - if (i < tile->n) - tiling = strided_tile(tile, space, insert_array); - else - tiling = isl_multi_aff_range_map(isl_space_copy(space)); - - lb = isl_multi_aff_zero(space); - for (i = 0; i < tile->n; ++i) { - isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb); - lb = isl_multi_aff_set_aff(lb, i, lb_i); - } - lb = isl_multi_aff_pullback_multi_aff(lb, insert_array); - - tiling = isl_multi_aff_sub(tiling, lb); - - p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling)); - p = gpu_array_ref_group_print_name(group, p); - local_name = isl_printer_get_str(p); - isl_printer_free(p); - tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name); - free(local_name); - - tile->tiling = tiling; -} diff --git a/polly/lib/External/ppcg/gpu_hybrid.h b/polly/lib/External/ppcg/gpu_hybrid.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_hybrid.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef GPU_HYBRID_H -#define GPU_HYBRID_H - -#include - -#include "gpu.h" -#include "hybrid.h" - -__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen, - __isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds, - int *tile_sizes); - -#endif diff --git a/polly/lib/External/ppcg/gpu_hybrid.c b/polly/lib/External/ppcg/gpu_hybrid.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_hybrid.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright 2013 Ecole Normale Superieure - * Copyright 2015 Sven Verdoolaege - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include - -#include -#include -#include -#include - -#include "hybrid.h" -#include "gpu_hybrid.h" -#include "gpu_tree.h" -#include "schedule.h" -#include "util.h" - -/* Have all domain elements been filtered out before reaching - * the "node" position in the schedule tree? - */ -static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node) -{ - isl_union_set *domain; - isl_bool empty; - - domain = isl_schedule_node_get_domain(node); - empty = isl_union_set_is_empty(domain); - isl_union_set_free(domain); - - return empty; -} - -/* Given a pointer to a phase in the result of hybrid tiling, - * map the phase to the device, provided the phase is non-empty. - * Empty phases can occur if the input schedule domain can be - * covered by a small number of hexagons that all belong to the same phase. - * - * The input has the following form: - * - * M - CT - P - C - ... - * - * with M the phase marker, CT the space tiling, P the original - * parent band and C the original child band. - * The (outer dimensions of the) C band need to be mapped to threads. - * The (outer dimension of the) CT band needs to be mapped to blocks. 
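In the unstrided case handled by gpu_array_ref_group_compute_tiling() in the preceding hunk, the tiling is simply t = a - lb(i), and such a mapping can be written down directly as an isl_multi_aff. A minimal sketch assuming a hypothetical lower bound lb(i) = 32i and made-up tuple names D, A and shared_A:

#include <stdio.h>
#include <stdlib.h>
#include <isl/ctx.h>
#include <isl/aff.h>

int main(void)
{
    isl_ctx *ctx = isl_ctx_alloc();
    /* Hypothetical tiling: block i copies A[32i .. 32i+31] into
     * shared_A[0 .. 31], i.e. t = a - 32i. */
    isl_multi_aff *tiling = isl_multi_aff_read_from_str(ctx,
        "{ [D[i] -> A[a]] -> shared_A[a - 32i] }");
    char *s = isl_multi_aff_to_str(tiling);

    printf("%s\n", s);
    free(s);
    isl_multi_aff_free(tiling);
    isl_ctx_free(ctx);
    return 0;
}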
- * The mapping to shared memory needs to be computed between the CT and - * the P band. - * - * The C band is first shifted to start at zero. - * Then the appropriate markers are introduced and a kernel is - * created for the tree rooted at CT. - * If the "unroll_gpu_tile" option is set, then the AST generator - * is instructed to unroll the P and C bands. - */ -static __isl_give isl_schedule_node *update_phase( - __isl_take isl_schedule_node *node, void *user) -{ - struct gpu_gen *gen = user; - int depth0, depth; - isl_ctx *ctx; - isl_id *id; - isl_bool empty_domain; - ppcg_ht_phase *phase; - - empty_domain = has_empty_domain(node); - if (empty_domain < 0) - return isl_schedule_node_free(node); - if (empty_domain) - return node; - - if (!node) - return NULL; - ctx = isl_schedule_node_get_ctx(node); - - phase = ppcg_ht_phase_extract_from_mark(node); - - depth0 = isl_schedule_node_get_tree_depth(node); - - node = isl_schedule_node_child(node, 0); - - node = isl_schedule_node_child(node, 0); - node = isl_schedule_node_child(node, 0); - node = ppcg_ht_phase_shift_space_point(phase, node); - if (gen->options->unroll_gpu_tile) - node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); - id = isl_id_alloc(ctx, "thread", NULL); - node = isl_schedule_node_insert_mark(node, id); - node = isl_schedule_node_parent(node); - if (gen->options->unroll_gpu_tile) - node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); - id = isl_id_alloc(ctx, "shared", NULL); - node = isl_schedule_node_insert_mark(node, id); - node = isl_schedule_node_parent(node); - - node = gpu_create_kernel(gen, node, 0, NULL); - - depth = isl_schedule_node_get_tree_depth(node); - node = isl_schedule_node_ancestor(node, depth - depth0); - - return node; -} - -/* Apply hybrid tiling on "node" and its parent based on the (valid) - * bounds on the relative dependence distances "bounds" and - * the tile sizes in "tile_sizes". - * The number of elements in "tile_sizes" is at least as large - * as the sum of the dimensions of the parent and the child node. - * - * Convert the tile_sizes to an isl_multi_val in the right space, - * insert the hybrid tiling and then create a kernel inside each phase. - * Finally, remove the phase marks. 
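The "thread" and "shared" markers that update_phase() introduces are ordinary isl mark nodes. A minimal sketch of the insertion pattern on a trivial, hypothetical one-statement schedule; ppcg performs the same isl_schedule_node_insert_mark() calls deep inside the hexagonally tiled tree:

#include <isl/ctx.h>
#include <isl/id.h>
#include <isl/union_set.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>

int main(void)
{
    isl_ctx *ctx = isl_ctx_alloc();
    /* Hypothetical single-statement domain. */
    isl_union_set *domain = isl_union_set_read_from_str(ctx,
        "{ S[i] : 0 <= i < 64 }");
    isl_schedule *schedule = isl_schedule_from_domain(domain);
    isl_schedule_node *node = isl_schedule_get_root(schedule);

    isl_schedule_free(schedule);
    node = isl_schedule_node_child(node, 0);
    /* Insert a "thread" mark, as update_phase() does. */
    node = isl_schedule_node_insert_mark(node,
        isl_id_alloc(ctx, "thread", NULL));
    schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);
    isl_schedule_dump(schedule);
    isl_schedule_free(schedule);
    isl_ctx_free(ctx);
    return 0;
}

isl itself attaches no semantics to marks; ppcg later looks the names up again during AST generation, which is why plain string identifiers suffice.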
- */ -__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen, - __isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds, - int *tile_sizes) -{ - isl_multi_val *mv; - isl_space *space, *space2; - - if (!node || !bounds) - goto error; - - space2 = isl_schedule_node_band_get_space(node); - node = isl_schedule_node_parent(node); - space = isl_schedule_node_band_get_space(node); - space = isl_space_product(space, space2); - mv = ppcg_multi_val_from_int_list(space, tile_sizes); - - node = ppcg_ht_bounds_insert_tiling(bounds, mv, node, gen->options); - - node = hybrid_tile_foreach_phase(node, &update_phase, gen); - - node = hybrid_tile_drop_phase_marks(node); - - return node; -error: - isl_schedule_node_free(node); - ppcg_ht_bounds_free(bounds); - return NULL; -} diff --git a/polly/lib/External/ppcg/gpu_print.h b/polly/lib/External/ppcg/gpu_print.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_print.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef GPU_PRINT_H -#define GPU_PRINT_H - -#include "gpu.h" - -__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p, - struct gpu_prog *prog); - -__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p, - struct gpu_types *types, struct gpu_prog *prog); - -__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node); - -__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn, - struct gpu_array_info *array); -__isl_give isl_printer *gpu_array_info_print_declaration_argument( - __isl_take isl_printer *p, struct gpu_array_info *array, - const char *memory_space); -__isl_give isl_printer *gpu_array_info_print_call_argument( - __isl_take isl_printer *p, struct gpu_array_info *array); - -__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt); -__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt); - -#endif diff --git a/polly/lib/External/ppcg/gpu_print.c b/polly/lib/External/ppcg/gpu_print.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_print.c +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright 2012 Ecole Normale Superieure - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France - */ - -#include - -#include - -#include "gpu_print.h" -#include "print.h" -#include "schedule.h" - -/* Print declarations to "p" for arrays that are local to "prog" - * but that are used on the host and therefore require a declaration. - */ -__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - int i; - - if (!prog) - return isl_printer_free(p); - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - isl_ast_expr *size; - - if (!array->declare_local) - continue; - size = array->declared_size; - p = ppcg_print_declaration_with_size(p, array->type, size); - } - - return p; -} - -/* Print an expression for the size of "array" in bytes. 
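The function whose implementation follows prints one parenthesized factor per array bound and a trailing sizeof. A plain-C sketch of the same output shape; the element type and the bound expressions n and m are hypothetical:

#include <stdio.h>

/* Minimal sketch of the output shape of gpu_array_info_print_size():
 * one parenthesized factor per bound, then sizeof(type). */
static void print_size(const char *type, const char **bounds, int n)
{
    int i;

    for (i = 0; i < n; ++i)
        printf("(%s) * ", bounds[i]);
    printf("sizeof(%s)", type);
}

int main(void)
{
    const char *bounds[] = { "n", "m" };

    print_size("float", bounds, 2);  /* prints: (n) * (m) * sizeof(float) */
    printf("\n");
    return 0;
}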
- */ -__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn, - struct gpu_array_info *array) -{ - int i; - - for (i = 0; i < array->n_index; ++i) { - isl_ast_expr *bound; - - prn = isl_printer_print_str(prn, "("); - bound = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i); - prn = isl_printer_print_ast_expr(prn, bound); - isl_ast_expr_free(bound); - prn = isl_printer_print_str(prn, ") * "); - } - prn = isl_printer_print_str(prn, "sizeof("); - prn = isl_printer_print_str(prn, array->type); - prn = isl_printer_print_str(prn, ")"); - - return prn; -} - -/* Print the declaration of a non-linearized array argument. - */ -static __isl_give isl_printer *print_non_linearized_declaration_argument( - __isl_take isl_printer *p, struct gpu_array_info *array) -{ - p = isl_printer_print_str(p, array->type); - p = isl_printer_print_str(p, " "); - - p = isl_printer_print_ast_expr(p, array->bound_expr); - - return p; -} - -/* Print the declaration of an array argument. - * "memory_space" allows to specify a memory space prefix. - */ -__isl_give isl_printer *gpu_array_info_print_declaration_argument( - __isl_take isl_printer *p, struct gpu_array_info *array, - const char *memory_space) -{ - if (gpu_array_is_read_only_scalar(array)) { - p = isl_printer_print_str(p, array->type); - p = isl_printer_print_str(p, " "); - p = isl_printer_print_str(p, array->name); - return p; - } - - if (memory_space) { - p = isl_printer_print_str(p, memory_space); - p = isl_printer_print_str(p, " "); - } - - if (array->n_index != 0 && !array->linearize) - return print_non_linearized_declaration_argument(p, array); - - p = isl_printer_print_str(p, array->type); - p = isl_printer_print_str(p, " "); - p = isl_printer_print_str(p, "*"); - p = isl_printer_print_str(p, array->name); - - return p; -} - -/* Print the call of an array argument. - */ -__isl_give isl_printer *gpu_array_info_print_call_argument( - __isl_take isl_printer *p, struct gpu_array_info *array) -{ - if (gpu_array_is_read_only_scalar(array)) - return isl_printer_print_str(p, array->name); - - p = isl_printer_print_str(p, "dev_"); - p = isl_printer_print_str(p, array->name); - - return p; -} - -/* Print an access to the element in the private/shared memory copy - * described by "stmt". The index of the copy is recorded in - * stmt->local_index as an access to the array. - */ -static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt) -{ - return isl_printer_print_ast_expr(p, stmt->u.c.local_index); -} - -/* Print an access to the element in the global memory copy - * described by "stmt". The index of the copy is recorded in - * stmt->index as an access to the array. - */ -static __isl_give isl_printer *stmt_print_global_index( - __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt) -{ - struct gpu_array_info *array = stmt->u.c.array; - isl_ast_expr *index; - - if (gpu_array_is_scalar(array)) { - if (!gpu_array_is_read_only_scalar(array)) - p = isl_printer_print_str(p, "*"); - p = isl_printer_print_str(p, array->name); - return p; - } - - index = isl_ast_expr_copy(stmt->u.c.index); - - p = isl_printer_print_ast_expr(p, index); - isl_ast_expr_free(index); - - return p; -} - -/* Print a copy statement. 
- * - * A read copy statement is printed as - * - * local = global; - * - * while a write copy statement is printed as - * - * global = local; - */ -__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt) -{ - p = isl_printer_start_line(p); - if (stmt->u.c.read) { - p = stmt_print_local_index(p, stmt); - p = isl_printer_print_str(p, " = "); - p = stmt_print_global_index(p, stmt); - } else { - p = stmt_print_global_index(p, stmt); - p = isl_printer_print_str(p, " = "); - p = stmt_print_local_index(p, stmt); - } - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p, - struct ppcg_kernel_stmt *stmt) -{ - return pet_stmt_print_body(stmt->u.d.stmt->stmt, p, stmt->u.d.ref2expr); -} - -/* This function is called for each node in a GPU AST. - * In case of a user node, print the macro definitions required - * for printing the AST expressions in the annotation, if any. - * For other nodes, return true such that descendants are also - * visited. - * - * In particular, for a kernel launch, print the macro definitions - * needed for the grid size. - * For a copy statement, print the macro definitions needed - * for the two index expressions. - * For an original user statement, print the macro definitions - * needed for the substitutions. - */ -static isl_bool at_node(__isl_keep isl_ast_node *node, void *user) -{ - const char *name; - isl_id *id; - int is_kernel; - struct ppcg_kernel *kernel; - struct ppcg_kernel_stmt *stmt; - isl_printer **p = user; - - if (isl_ast_node_get_type(node) != isl_ast_node_user) - return isl_bool_true; - - id = isl_ast_node_get_annotation(node); - if (!id) - return isl_bool_false; - - name = isl_id_get_name(id); - if (!name) - return isl_bool_error; - is_kernel = !strcmp(name, "kernel"); - kernel = is_kernel ? isl_id_get_user(id) : NULL; - stmt = is_kernel ? NULL : isl_id_get_user(id); - isl_id_free(id); - - if ((is_kernel && !kernel) || (!is_kernel && !stmt)) - return isl_bool_error; - - if (is_kernel) { - *p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p); - } else if (stmt->type == ppcg_kernel_copy) { - *p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p); - *p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p); - } else if (stmt->type == ppcg_kernel_domain) { - *p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr); - } - if (!*p) - return isl_bool_error; - - return isl_bool_false; -} - -/* Print the required macros for the GPU AST "node" to "p", - * including those needed for the user statements inside the AST. - */ -__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node) -{ - if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0) - return isl_printer_free(p); - p = ppcg_print_macros(p, node); - return p; -} - -/* Was the definition of "type" printed before? - * That is, does its name appear in the list of printed types "types"? - */ -static int already_printed(struct gpu_types *types, - struct pet_type *type) -{ - int i; - - for (i = 0; i < types->n; ++i) - if (!strcmp(types->name[i], type->name)) - return 1; - - return 0; -} - -/* Print the definitions of all types prog->scop that have not been - * printed before (according to "types") on "p". - * Extend the list of printed types "types" with the newly printed types. 
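A plain-C sketch of the two statement shapes that ppcg_kernel_print_copy() above emits; the index strings are invented for illustration (tx standing for a thread index, b for a block index):

#include <stdio.h>

/* Minimal sketch: a read copy assigns the global element to the local
 * (shared/private) copy, a write copy does the reverse. */
static void print_copy(int read, const char *local, const char *global)
{
    if (read)
        printf("%s = %s;\n", local, global);
    else
        printf("%s = %s;\n", global, local);
}

int main(void)
{
    print_copy(1, "shared_A[tx]", "A[32 * b + tx]");  /* read copy */
    print_copy(0, "shared_A[tx]", "A[32 * b + tx]");  /* write copy */
    return 0;
}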
- */ -__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p, - struct gpu_types *types, struct gpu_prog *prog) -{ - int i, n; - isl_ctx *ctx; - char **name; - - n = prog->scop->pet->n_type; - - if (n == 0) - return p; - - ctx = isl_printer_get_ctx(p); - name = isl_realloc_array(ctx, types->name, char *, types->n + n); - if (!name) - return isl_printer_free(p); - types->name = name; - - for (i = 0; i < n; ++i) { - struct pet_type *type = prog->scop->pet->types[i]; - - if (already_printed(types, type)) - continue; - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, type->definition); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - types->name[types->n++] = strdup(type->name); - } - - return p; -} diff --git a/polly/lib/External/ppcg/gpu_tree.h b/polly/lib/External/ppcg/gpu_tree.h deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_tree.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef GPU_TREE_H -#define GPU_TREE_H - -#include - -#include "gpu.h" - -__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread( - __isl_take isl_schedule_node *node); -int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node); -__isl_give isl_schedule_node *gpu_tree_move_down_to_shared( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core); -__isl_give isl_schedule_node *gpu_tree_move_up_to_thread( - __isl_take isl_schedule_node *node); -__isl_give isl_schedule_node *gpu_tree_move_down_to_thread( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core); -__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel( - __isl_take isl_schedule_node *node); -__isl_give isl_schedule_node *gpu_tree_move_down_to_depth( - __isl_take isl_schedule_node *node, int depth, - __isl_keep isl_union_set *core); - -int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel); -__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); -__isl_give isl_schedule_node *gpu_tree_ensure_following_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); -__isl_give isl_schedule_node *gpu_tree_move_left_to_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); -__isl_give isl_schedule_node *gpu_tree_move_right_to_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel); - -#endif diff --git a/polly/lib/External/ppcg/gpu_tree.c b/polly/lib/External/ppcg/gpu_tree.c deleted file mode 100644 --- a/polly/lib/External/ppcg/gpu_tree.c +++ /dev/null @@ -1,640 +0,0 @@ -/* - * Copyright 2013 Ecole Normale Superieure - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include - -#include -#include -#include - -#include "gpu_tree.h" - -/* The functions in this file are used to navigate part of a schedule tree - * that is mapped to blocks. Initially, this part consists of a linear - * branch segment with a mark node with name "kernel" on the outer end - * and a mark node with name "thread" on the inner end. - * During the mapping to blocks, branching may be introduced, but only - * one of the elements in each sequence contains the "thread" mark. - * The filter of this element (and only this filter) contains - * domain elements identified by the "core" argument of the functions - * that move down this tree. 
- * - * Synchronization statements have a name that starts with "sync" and - * a user pointer pointing to the kernel that contains the synchronization. - * The functions inserting or detecting synchronizations take a ppcg_kernel - * argument to be able to create or identify such statements. - * They may also use two fields in this structure, the "core" field - * to move around in the tree and the "n_sync" field to make sure that - * each synchronization has a different name (within the kernel). - */ - -/* Is "node" a mark node with an identifier called "name"? - */ -static int is_marked(__isl_keep isl_schedule_node *node, const char *name) -{ - isl_id *mark; - int has_name; - - if (!node) - return -1; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_mark) - return 0; - - mark = isl_schedule_node_mark_get_id(node); - if (!mark) - return -1; - - has_name = !strcmp(isl_id_get_name(mark), name); - isl_id_free(mark); - - return has_name; -} - -/* Is "node" a mark node with an identifier called "kernel"? - */ -int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node) -{ - return is_marked(node, "kernel"); -} - -/* Is "node" a mark node with an identifier called "shared"? - */ -static int node_is_shared(__isl_keep isl_schedule_node *node) -{ - return is_marked(node, "shared"); -} - -/* Is "node" a mark node with an identifier called "thread"? - */ -static int node_is_thread(__isl_keep isl_schedule_node *node) -{ - return is_marked(node, "thread"); -} - -/* Insert a mark node with identifier "shared" in front of "node". - */ -static __isl_give isl_schedule_node *insert_shared( - __isl_take isl_schedule_node *node) -{ - isl_ctx *ctx; - isl_id *id; - - ctx = isl_schedule_node_get_ctx(node); - id = isl_id_alloc(ctx, "shared", NULL); - node = isl_schedule_node_insert_mark(node, id); - - return node; -} - -/* Insert a "shared" mark in front of the "thread" mark - * provided the linear branch between "node" and the "thread" mark - * does not contain such a "shared" mark already. - * - * As a side effect, this function checks that the subtree at "node" - * actually contains a "thread" mark and that there is no branching - * in between "node" and this "thread" mark. - */ -__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread( - __isl_take isl_schedule_node *node) -{ - int depth0, depth; - int any_shared = 0; - - if (!node) - return NULL; - - depth0 = isl_schedule_node_get_tree_depth(node); - - for (;;) { - int is_thread; - int n; - - if (!any_shared) { - any_shared = node_is_shared(node); - if (any_shared < 0) - return isl_schedule_node_free(node); - } - is_thread = node_is_thread(node); - if (is_thread < 0) - return isl_schedule_node_free(node); - if (is_thread) - break; - n = isl_schedule_node_n_children(node); - if (n == 0) - isl_die(isl_schedule_node_get_ctx(node), - isl_error_invalid, - "no thread marker found", - return isl_schedule_node_free(node)); - if (n > 1) - isl_die(isl_schedule_node_get_ctx(node), - isl_error_invalid, - "expecting single thread marker", - return isl_schedule_node_free(node)); - - node = isl_schedule_node_child(node, 0); - } - - if (!any_shared) - node = insert_shared(node); - depth = isl_schedule_node_get_tree_depth(node); - node = isl_schedule_node_ancestor(node, depth - depth0); - - return node; -} - -/* Assuming "node" is a filter node, does it correspond to the branch - * that contains the "thread" mark, i.e., does it contain any elements - * in "core"? 
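The mark tests in this file all reduce to comparing the name of an isl_id. A minimal sketch of that test in isolation; the "thread" name matches the convention described above, the rest of the program is hypothetical:

#include <stdio.h>
#include <string.h>
#include <isl/ctx.h>
#include <isl/id.h>

/* Minimal sketch of the name comparison at the heart of is_marked():
 * an isl_id carries a name plus an optional user pointer, and ppcg
 * tells its "kernel", "shared" and "thread" marks apart by name only. */
int main(void)
{
    isl_ctx *ctx = isl_ctx_alloc();
    isl_id *id = isl_id_alloc(ctx, "thread", NULL);

    printf("is thread mark: %d\n", !strcmp(isl_id_get_name(id), "thread"));
    isl_id_free(id);
    isl_ctx_free(ctx);
    return 0;
}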
- */ -static int node_is_core(__isl_keep isl_schedule_node *node, - __isl_keep isl_union_set *core) -{ - int disjoint; - isl_union_set *filter; - - filter = isl_schedule_node_filter_get_filter(node); - disjoint = isl_union_set_is_disjoint(filter, core); - isl_union_set_free(filter); - if (disjoint < 0) - return -1; - - return !disjoint; -} - -/* Move to the only child of "node" that has the "thread" mark as descendant, - * where the branch containing this mark is identified by the domain elements - * in "core". - * - * If "node" is not a sequence, then it only has one child and we move - * to that single child. - * Otherwise, we check each of the filters in the children, pick - * the one that corresponds to "core" and return a pointer to the child - * of the filter node. - */ -static __isl_give isl_schedule_node *core_child( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) -{ - int i, n; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence) - return isl_schedule_node_child(node, 0); - - n = isl_schedule_node_n_children(node); - for (i = 0; i < n; ++i) { - int is_core; - - node = isl_schedule_node_child(node, i); - is_core = node_is_core(node, core); - - if (is_core < 0) - return isl_schedule_node_free(node); - if (is_core) - return isl_schedule_node_child(node, 0); - - node = isl_schedule_node_parent(node); - } - - isl_die(isl_schedule_node_get_ctx(node), isl_error_internal, - "core child not found", return isl_schedule_node_free(node)); -} - -/* Move down the branch between "kernel" and "thread" until - * the "shared" mark is reached, where the branch containing the "shared" - * mark is identified by the domain elements in "core". - */ -__isl_give isl_schedule_node *gpu_tree_move_down_to_shared( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) -{ - int is_shared; - - while ((is_shared = node_is_shared(node)) == 0) - node = core_child(node, core); - if (is_shared < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Move down the branch between "kernel" and "thread" until - * the "thread" mark is reached, where the branch containing the "thread" - * mark is identified by the domain elements in "core". - */ -__isl_give isl_schedule_node *gpu_tree_move_down_to_thread( - __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) -{ - int is_thread; - - while ((is_thread = node_is_thread(node)) == 0) - node = core_child(node, core); - if (is_thread < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Move up the tree underneath the "thread" mark until - * the "thread" mark is reached. - */ -__isl_give isl_schedule_node *gpu_tree_move_up_to_thread( - __isl_take isl_schedule_node *node) -{ - int is_thread; - - while ((is_thread = node_is_thread(node)) == 0) - node = isl_schedule_node_parent(node); - if (is_thread < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Move up the tree underneath the "kernel" mark until - * the "kernel" mark is reached. - */ -__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel( - __isl_take isl_schedule_node *node) -{ - int is_kernel; - - while ((is_kernel = gpu_tree_node_is_kernel(node)) == 0) - node = isl_schedule_node_parent(node); - if (is_kernel < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Move down from the "kernel" mark (or at least a node with schedule - * depth smaller than or equal to "depth") to a band node at schedule - * depth "depth". 
The "thread" mark is assumed to have a schedule - * depth greater than or equal to "depth". The branch containing the - * "thread" mark is identified by the domain elements in "core". - * - * If the desired schedule depth is in the middle of band node, - * then the band node is split into two pieces, the second piece - * at the desired schedule depth. - */ -__isl_give isl_schedule_node *gpu_tree_move_down_to_depth( - __isl_take isl_schedule_node *node, int depth, - __isl_keep isl_union_set *core) -{ - int is_shared; - int is_thread = 0; - - while (node && isl_schedule_node_get_schedule_depth(node) < depth) { - if (isl_schedule_node_get_type(node) == - isl_schedule_node_band) { - int node_depth, node_dim; - node_depth = isl_schedule_node_get_schedule_depth(node); - node_dim = isl_schedule_node_band_n_member(node); - if (node_depth + node_dim > depth) - node = isl_schedule_node_band_split(node, - depth - node_depth); - } - node = core_child(node, core); - } - while ((is_shared = node_is_shared(node)) == 0 && - (is_thread = node_is_thread(node)) == 0 && - isl_schedule_node_get_type(node) != isl_schedule_node_band) - node = core_child(node, core); - if (is_shared < 0 || is_thread < 0) - node = isl_schedule_node_free(node); - - return node; -} - -/* Create a union set containing a single set with a tuple identifier - * called "syncX" and user pointer equal to "kernel". - */ -static __isl_give isl_union_set *create_sync_domain(struct ppcg_kernel *kernel) -{ - isl_space *space; - isl_id *id; - char name[40]; - - space = isl_space_set_alloc(kernel->ctx, 0, 0); - snprintf(name, sizeof(name), "sync%d", kernel->n_sync++); - id = isl_id_alloc(kernel->ctx, name, kernel); - space = isl_space_set_tuple_id(space, isl_dim_set, id); - return isl_union_set_from_set(isl_set_universe(space)); -} - -/* Is "id" the identifier of a synchronization statement inside "kernel"? - * That is, does its name start with "sync" and does it point to "kernel"? - */ -int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel) -{ - const char *name; - - name = isl_id_get_name(id); - if (!name) - return 0; - else if (strncmp(name, "sync", 4)) - return 0; - return isl_id_get_user(id) == kernel; -} - -/* Does "domain" consist of a single set with a tuple identifier - * corresponding to a synchronization for "kernel"? - */ -static int domain_is_sync(__isl_keep isl_union_set *domain, - struct ppcg_kernel *kernel) -{ - int is_sync; - isl_id *id; - isl_set *set; - - if (isl_union_set_n_set(domain) != 1) - return 0; - set = isl_set_from_union_set(isl_union_set_copy(domain)); - id = isl_set_get_tuple_id(set); - is_sync = gpu_tree_id_is_sync(id, kernel); - isl_id_free(id); - isl_set_free(set); - - return is_sync; -} - -/* Does "node" point to a filter selecting a synchronization statement - * for "kernel"? - */ -static int node_is_sync_filter(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int is_sync; - enum isl_schedule_node_type type; - isl_union_set *domain; - - if (!node) - return -1; - type = isl_schedule_node_get_type(node); - if (type != isl_schedule_node_filter) - return 0; - domain = isl_schedule_node_filter_get_filter(node); - is_sync = domain_is_sync(domain, kernel); - isl_union_set_free(domain); - - return is_sync; -} - -/* Is "node" part of a sequence with a previous synchronization statement - * for "kernel"? - * That is, is the parent of "node" a filter such that there is - * a previous filter that picks out exactly such a synchronization statement? 
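The synchronization statements created by create_sync_domain() above follow a simple naming scheme. A minimal sketch of the sync0, sync1, ... naming and of the prefix test used by gpu_tree_id_is_sync(); the counter value is made up, and the real test additionally compares the isl_id user pointer against the kernel, which is omitted here:

#include <stdio.h>
#include <string.h>

static int name_is_sync(const char *name)
{
    /* Same prefix test as in gpu_tree_id_is_sync(). */
    return name && !strncmp(name, "sync", 4);
}

int main(void)
{
    char name[40];
    int n_sync = 2;  /* made-up counter state */

    snprintf(name, sizeof(name), "sync%d", n_sync++);
    printf("%s -> %d\n", name, name_is_sync(name));  /* sync2 -> 1 */
    printf("S_0 -> %d\n", name_is_sync("S_0"));      /* S_0 -> 0 */
    return 0;
}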
- */ -static int has_preceding_sync(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int found = 0; - - node = isl_schedule_node_copy(node); - node = isl_schedule_node_parent(node); - while (!found && isl_schedule_node_has_previous_sibling(node)) { - node = isl_schedule_node_previous_sibling(node); - if (!node) - break; - found = node_is_sync_filter(node, kernel); - } - if (!node) - found = -1; - isl_schedule_node_free(node); - - return found; -} - -/* Is "node" part of a sequence with a subsequent synchronization statement - * for "kernel"? - * That is, is the parent of "node" a filter such that there is - * a subsequent filter that picks out exactly such a synchronization statement? - */ -static int has_following_sync(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int found = 0; - - node = isl_schedule_node_copy(node); - node = isl_schedule_node_parent(node); - while (!found && isl_schedule_node_has_next_sibling(node)) { - node = isl_schedule_node_next_sibling(node); - if (!node) - break; - found = node_is_sync_filter(node, kernel); - } - if (!node) - found = -1; - isl_schedule_node_free(node); - - return found; -} - -/* Does the subtree rooted at "node" (which is a band node) contain - * any synchronization statement for "kernel" that precedes - * the core computation of "kernel" (identified by the elements - * in kernel->core)? - */ -static int has_sync_before_core(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int has_sync = 0; - int is_thread; - - node = isl_schedule_node_copy(node); - while ((is_thread = node_is_thread(node)) == 0) { - node = core_child(node, kernel->core); - has_sync = has_preceding_sync(node, kernel); - if (has_sync < 0 || has_sync) - break; - } - if (is_thread < 0 || !node) - has_sync = -1; - isl_schedule_node_free(node); - - return has_sync; -} - -/* Does the subtree rooted at "node" (which is a band node) contain - * any synchronization statement for "kernel" that follows - * the core computation of "kernel" (identified by the elements - * in kernel->core)? - */ -static int has_sync_after_core(__isl_keep isl_schedule_node *node, - struct ppcg_kernel *kernel) -{ - int has_sync = 0; - int is_thread; - - node = isl_schedule_node_copy(node); - while ((is_thread = node_is_thread(node)) == 0) { - node = core_child(node, kernel->core); - has_sync = has_following_sync(node, kernel); - if (has_sync < 0 || has_sync) - break; - } - if (is_thread < 0 || !node) - has_sync = -1; - isl_schedule_node_free(node); - - return has_sync; -} - -/* Insert (or extend) an extension on top of "node" that puts - * a synchronization node for "kernel" before "node". - * Return a pointer to the original node in the updated schedule tree. - */ -static __isl_give isl_schedule_node *insert_sync_before( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - isl_union_set *domain; - isl_schedule_node *graft; - - if (!node) - return NULL; - - domain = create_sync_domain(kernel); - graft = isl_schedule_node_from_domain(domain); - node = isl_schedule_node_graft_before(node, graft); - - return node; -} - -/* Insert (or extend) an extension on top of "node" that puts - * a synchronization node for "kernel" after "node". - * Return a pointer to the original node in the updated schedule tree.
- */ -static __isl_give isl_schedule_node *insert_sync_after( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - isl_union_set *domain; - isl_schedule_node *graft; - - if (!node) - return NULL; - - domain = create_sync_domain(kernel); - graft = isl_schedule_node_from_domain(domain); - node = isl_schedule_node_graft_after(node, graft); - - return node; -} - -/* Insert an extension on top of "node" that puts a synchronization node - * for "kernel" before "node" unless there already is - * such a synchronization node. - */ -__isl_give isl_schedule_node *gpu_tree_ensure_preceding_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - - has_sync = has_preceding_sync(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - return insert_sync_before(node, kernel); -} - -/* Insert an extension on top of "node" that puts a synchronization node - * for "kernel" after "node" unless there already is - * such a synchronization node. - */ -__isl_give isl_schedule_node *gpu_tree_ensure_following_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - - has_sync = has_following_sync(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - return insert_sync_after(node, kernel); -} - -/* Insert an extension on top of "node" that puts a synchronization node - * for "kernel" after "node" unless there already is such a sync node or - * "node" itself already contains a synchronization node following - * the core computation of "kernel". - */ -__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - - has_sync = has_sync_after_core(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - has_sync = has_following_sync(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - return insert_sync_after(node, kernel); -} - -/* Move left in the sequence on top of "node" to a synchronization node - * for "kernel". - * If "node" itself contains a synchronization node preceding - * the core computation of "kernel", then return "node" itself. - * Otherwise, if "node" does not have a preceding synchronization node, - * then create one first. - */ -__isl_give isl_schedule_node *gpu_tree_move_left_to_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - int is_sync; - - has_sync = has_sync_before_core(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - node = gpu_tree_ensure_preceding_sync(node, kernel); - node = isl_schedule_node_parent(node); - while ((is_sync = node_is_sync_filter(node, kernel)) == 0) - node = isl_schedule_node_previous_sibling(node); - if (is_sync < 0) - node = isl_schedule_node_free(node); - node = isl_schedule_node_child(node, 0); - - return node; -} - -/* Move right in the sequence on top of "node" to a synchronization node - * for "kernel". - * If "node" itself contains a synchronization node following - * the core computation of "kernel", then return "node" itself. - * Otherwise, if "node" does not have a following synchronization node, - * then create one first.
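The ensure functions just shown share one idiom: query, propagate errors, return early when nothing needs to be done, insert otherwise. A minimal plain-C sketch of that control flow with stand-in types; in the real code the query can return a negative error value and the isl node would then be freed:

#include <stdio.h>

struct node { int has_sync; };  /* stand-in for isl_schedule_node */

static int has_following_sync(struct node *node)
{
    return node->has_sync;  /* the real query can also return -1 */
}

static struct node *insert_sync_after(struct node *node)
{
    node->has_sync = 1;
    return node;
}

static struct node *ensure_following_sync(struct node *node)
{
    int has_sync = has_following_sync(node);

    if (has_sync < 0)
        return NULL;  /* error path; the isl code frees the node here */
    if (has_sync)
        return node;  /* nothing to do */
    return insert_sync_after(node);
}

int main(void)
{
    struct node n = { 0 };

    ensure_following_sync(&n);
    printf("has sync: %d\n", n.has_sync);  /* 1 */
    return 0;
}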
- */ -__isl_give isl_schedule_node *gpu_tree_move_right_to_sync( - __isl_take isl_schedule_node *node, struct ppcg_kernel *kernel) -{ - int has_sync; - int is_sync; - - has_sync = has_sync_after_core(node, kernel); - if (has_sync < 0) - return isl_schedule_node_free(node); - if (has_sync) - return node; - node = gpu_tree_ensure_following_sync(node, kernel); - node = isl_schedule_node_parent(node); - while ((is_sync = node_is_sync_filter(node, kernel)) == 0) - node = isl_schedule_node_next_sibling(node); - if (is_sync < 0) - node = isl_schedule_node_free(node); - node = isl_schedule_node_child(node, 0); - - return node; -} diff --git a/polly/lib/External/ppcg/grouping.c b/polly/lib/External/ppcg/grouping.c deleted file mode 100644 --- a/polly/lib/External/ppcg/grouping.c +++ /dev/null @@ -1,684 +0,0 @@ -/* - * Copyright 2016 Sven Verdoolaege - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ppcg.h" - -/* Internal data structure for use during the detection of statements - * that can be grouped. - * - * "sc" contains the original schedule constraints (not a copy). - * "dep" contains the intersection of the validity and the proximity - * constraints in "sc". It may be NULL if it has not been computed yet. - * "group_id" is the identifier for the next group that is extracted. - * - * "domain" is the set of statement instances that belong to any of the groups. - * "contraction" maps the elements of "domain" to the corresponding group - * instances. - * "schedule" schedules the statements in each group relatively to each other. - * These last three fields are NULL if no groups have been found so far. - */ -struct ppcg_grouping { - isl_schedule_constraints *sc; - - isl_union_map *dep; - int group_id; - - isl_union_set *domain; - isl_union_pw_multi_aff *contraction; - isl_schedule *schedule; -}; - -/* Clear all memory allocated by "grouping". - */ -static void ppcg_grouping_clear(struct ppcg_grouping *grouping) -{ - isl_union_map_free(grouping->dep); - isl_union_set_free(grouping->domain); - isl_union_pw_multi_aff_free(grouping->contraction); - isl_schedule_free(grouping->schedule); -} - -/* Compute the intersection of the proximity and validity dependences - * in grouping->sc and store the result in grouping->dep, unless - * this intersection has been computed before. - */ -static isl_stat ppcg_grouping_compute_dep(struct ppcg_grouping *grouping) -{ - isl_union_map *validity, *proximity; - - if (grouping->dep) - return isl_stat_ok; - - validity = isl_schedule_constraints_get_validity(grouping->sc); - proximity = isl_schedule_constraints_get_proximity(grouping->sc); - grouping->dep = isl_union_map_intersect(validity, proximity); - - if (!grouping->dep) - return isl_stat_error; - - return isl_stat_ok; -} - -/* Information extracted from one or more consecutive leaves - * in the input schedule. - * - * "list" contains the sets of statement instances in the leaves, - * one element in the list for each original leaf. - * "domain" contains the union of the sets in "list". - * "prefix" contains the prefix schedule of these elements. - */ -struct ppcg_grouping_leaf { - isl_union_set *domain; - isl_union_set_list *list; - isl_multi_union_pw_aff *prefix; -}; - -/* Free all memory allocated for "leaves". 
- */ -static void ppcg_grouping_leaf_free(int n, struct ppcg_grouping_leaf leaves[]) -{ - int i; - - if (!leaves) - return; - - for (i = 0; i < n; ++i) { - isl_union_set_free(leaves[i].domain); - isl_union_set_list_free(leaves[i].list); - isl_multi_union_pw_aff_free(leaves[i].prefix); - } - - free(leaves); -} - -/* Short-hand for retrieving the prefix schedule at "node" - * in the form of an isl_multi_union_pw_aff. - */ -static __isl_give isl_multi_union_pw_aff *get_prefix( - __isl_keep isl_schedule_node *node) -{ - return isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); -} - -/* Return an array of "n" elements with information extracted from - * the "n" children of "node" starting at "first", all of which - * are known to be filtered leaves. - */ -struct ppcg_grouping_leaf *extract_leaves(__isl_keep isl_schedule_node *node, - int first, int n) -{ - int i; - isl_ctx *ctx; - struct ppcg_grouping_leaf *leaves; - - if (!node) - return NULL; - - ctx = isl_schedule_node_get_ctx(node); - leaves = isl_calloc_array(ctx, struct ppcg_grouping_leaf, n); - if (!leaves) - return NULL; - - for (i = 0; i < n; ++i) { - isl_schedule_node *child; - isl_union_set *domain; - - child = isl_schedule_node_get_child(node, first + i); - child = isl_schedule_node_child(child, 0); - domain = isl_schedule_node_get_domain(child); - leaves[i].domain = isl_union_set_copy(domain); - leaves[i].list = isl_union_set_list_from_union_set(domain); - leaves[i].prefix = get_prefix(child); - isl_schedule_node_free(child); - } - - return leaves; -} - -/* Internal data structure used by merge_leaves. - * - * "src" and "dst" point to the two consecutive leaves that are - * under investigation for being merged. - * "merge" is initially set to 0 and is set to 1 as soon as - * it turns out that it is useful to merge the two leaves. - */ -struct ppcg_merge_leaves_data { - int merge; - struct ppcg_grouping_leaf *src; - struct ppcg_grouping_leaf *dst; -}; - -/* Given a relation "map" between instances of two statements A and B, - * does it relate every instance of A (according to the domain of "src") - * to every instance of B (according to the domain of "dst")? - */ -static isl_bool covers_src_and_dst(__isl_keep isl_map *map, - struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst) -{ - isl_space *space; - isl_set *set1, *set2; - isl_bool is_subset; - - space = isl_space_domain(isl_map_get_space(map)); - set1 = isl_union_set_extract_set(src->domain, space); - set2 = isl_map_domain(isl_map_copy(map)); - is_subset = isl_set_is_subset(set1, set2); - isl_set_free(set1); - isl_set_free(set2); - if (is_subset < 0 || !is_subset) - return is_subset; - - space = isl_space_range(isl_map_get_space(map)); - set1 = isl_union_set_extract_set(dst->domain, space); - set2 = isl_map_range(isl_map_copy(map)); - is_subset = isl_set_is_subset(set1, set2); - isl_set_free(set1); - isl_set_free(set2); - - return is_subset; -} - -/* Given a relation "map" between instances of two statements A and B, - * are pairs of related instances executed together in the input schedule? - * That is, is each pair of instances assigned the same value - * by the corresponding prefix schedules? - * - * In particular, select the subset of "map" that has pairs of elements - * with the same value for the prefix schedules and then check - * if "map" is still a subset of the result. 
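covers_src_and_dst() above boils down to two isl subset tests. A minimal sketch of one of them; the dependence relation and the instance set are hypothetical, chosen so that the domain of the relation covers all instances of A:

#include <stdio.h>
#include <isl/ctx.h>
#include <isl/set.h>
#include <isl/map.h>

int main(void)
{
    isl_ctx *ctx = isl_ctx_alloc();
    isl_map *dep = isl_map_read_from_str(ctx,
        "{ A[i] -> B[i] : 0 <= i < 100 }");
    isl_set *instances = isl_set_read_from_str(ctx,
        "{ A[i] : 0 <= i < 100 }");
    isl_set *dom = isl_map_domain(isl_map_copy(dep));

    /* Same test as in covers_src_and_dst(): are all instances of A
     * covered by the domain of the dependence relation? */
    printf("covers all of A: %d\n",
        isl_set_is_subset(instances, dom) == isl_bool_true);
    isl_set_free(dom);
    isl_set_free(instances);
    isl_map_free(dep);
    isl_ctx_free(ctx);
    return 0;
}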
- */ -static isl_bool matches_prefix(__isl_keep isl_map *map, - struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst) -{ - isl_union_map *umap, *equal; - isl_multi_union_pw_aff *src_prefix, *dst_prefix, *prefix; - isl_bool is_subset; - - src_prefix = isl_multi_union_pw_aff_copy(src->prefix); - dst_prefix = isl_multi_union_pw_aff_copy(dst->prefix); - prefix = isl_multi_union_pw_aff_union_add(src_prefix, dst_prefix); - - umap = isl_union_map_from_map(isl_map_copy(map)); - equal = isl_union_map_copy(umap); - equal = isl_union_map_eq_at_multi_union_pw_aff(equal, prefix); - - is_subset = isl_union_map_is_subset(umap, equal); - - isl_union_map_free(umap); - isl_union_map_free(equal); - - return is_subset; -} - -/* Given a set of validity and proximity schedule constraints "map" - * between statements in consecutive leaves in a valid schedule, - * should the two leaves be merged into one? - * - * In particular, the two are merged if the constraints form - * a bijection between every instance of the first statement and - * every instance of the second statement. Moreover, each - * pair of such dependent instances needs to be executed consecutively - * in the input schedule. That is, they need to be assigned - * the same value by their prefix schedules. - * - * What this means is that for each instance of the first statement - * there is exactly one instance of the second statement that - * is executed immediately after the instance of the first statement and - * that, moreover, both depends on this statement instance and - * should be brought as close as possible to this statement instance. - * In other words, it is both possible to execute the two instances - * together (according to the input schedule) and desirable to do so - * (according to the validity and proximity schedule constraints). - */ -static isl_stat check_merge(__isl_take isl_map *map, void *user) -{ - struct ppcg_merge_leaves_data *data = user; - isl_bool ok; - - ok = covers_src_and_dst(map, data->src, data->dst); - if (ok >= 0 && ok) - ok = isl_map_is_bijective(map); - if (ok >= 0 && ok) - ok = matches_prefix(map, data->src, data->dst); - - isl_map_free(map); - - if (ok < 0) - return isl_stat_error; - if (!ok) - return isl_stat_ok; - - data->merge = 1; - return isl_stat_error; -} - -/* Merge the leaves at position "pos" and "pos + 1" in "leaves". - */ -static isl_stat merge_pair(int n, struct ppcg_grouping_leaf leaves[], int pos) -{ - int i; - - leaves[pos].domain = isl_union_set_union(leaves[pos].domain, - leaves[pos + 1].domain); - leaves[pos].list = isl_union_set_list_concat(leaves[pos].list, - leaves[pos + 1].list); - leaves[pos].prefix = isl_multi_union_pw_aff_union_add( - leaves[pos].prefix, leaves[pos + 1].prefix); - for (i = pos + 1; i + 1 < n; ++i) - leaves[i] = leaves[i + 1]; - leaves[n - 1].domain = NULL; - leaves[n - 1].list = NULL; - leaves[n - 1].prefix = NULL; - - if (!leaves[pos].domain || !leaves[pos].list || !leaves[pos].prefix) - return isl_stat_error; - - return isl_stat_ok; -} - -/* Merge pairs of consecutive leaves in "leaves" taking into account - * the intersection of validity and proximity schedule constraints "dep". - * - * If a leaf has been merged with the next leaf, then the combination - * is checked again for merging with the next leaf. - * That is, if the leaves are A, B and C, then B may not have been - * merged with C, but after merging A and B, it could still be useful - * to merge the combination AB with C. 
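The bijectivity criterion that check_merge() above applies can be tried out in isolation. A minimal sketch contrasting a one-to-one dependence with a fan-out; both relations are made up:

#include <stdio.h>
#include <isl/ctx.h>
#include <isl/map.h>

int main(void)
{
    isl_ctx *ctx = isl_ctx_alloc();
    isl_map *one_to_one = isl_map_read_from_str(ctx,
        "{ A[i] -> B[i] : 0 <= i < 10 }");
    isl_map *fan_out = isl_map_read_from_str(ctx,
        "{ A[i] -> B[j] : 0 <= i < 10 and 0 <= j < 10 }");

    /* Only the first relation satisfies check_merge's criterion. */
    printf("A[i]->B[i] bijective: %d\n",
        isl_map_is_bijective(one_to_one) == isl_bool_true);
    printf("A[i]->B[j] bijective: %d\n",
        isl_map_is_bijective(fan_out) == isl_bool_true);
    isl_map_free(one_to_one);
    isl_map_free(fan_out);
    isl_ctx_free(ctx);
    return 0;
}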
- * - * Two leaves A and B are merged if there are instances of at least - * one pair of statements, one statement in A and one B, such that - * the validity and proximity schedule constraints between them - * make them suitable for merging according to check_merge. - * - * Return the final number of leaves in the sequence, or -1 on error. - */ -static int merge_leaves(int n, struct ppcg_grouping_leaf leaves[], - __isl_keep isl_union_map *dep) -{ - int i; - struct ppcg_merge_leaves_data data; - - for (i = n - 1; i >= 0; --i) { - isl_union_map *dep_i; - isl_stat ok; - - if (i + 1 >= n) - continue; - - dep_i = isl_union_map_copy(dep); - dep_i = isl_union_map_intersect_domain(dep_i, - isl_union_set_copy(leaves[i].domain)); - dep_i = isl_union_map_intersect_range(dep_i, - isl_union_set_copy(leaves[i + 1].domain)); - data.merge = 0; - data.src = &leaves[i]; - data.dst = &leaves[i + 1]; - ok = isl_union_map_foreach_map(dep_i, &check_merge, &data); - isl_union_map_free(dep_i); - if (ok < 0 && !data.merge) - return -1; - if (!data.merge) - continue; - if (merge_pair(n, leaves, i) < 0) - return -1; - --n; - ++i; - } - - return n; -} - -/* Construct a schedule with "domain" as domain, that executes - * the elements of "list" in order (as a sequence). - */ -static __isl_give isl_schedule *schedule_from_domain_and_list( - __isl_keep isl_union_set *domain, __isl_keep isl_union_set_list *list) -{ - isl_schedule *schedule; - isl_schedule_node *node; - - schedule = isl_schedule_from_domain(isl_union_set_copy(domain)); - node = isl_schedule_get_root(schedule); - isl_schedule_free(schedule); - node = isl_schedule_node_child(node, 0); - list = isl_union_set_list_copy(list); - node = isl_schedule_node_insert_sequence(node, list); - schedule = isl_schedule_node_get_schedule(node); - isl_schedule_node_free(node); - - return schedule; -} - -/* Construct a unique identifier for a group in "grouping". - * - * The name is of the form G_n, with n the first value starting at - * grouping->group_id that does not result in an identifier - * that is already in use in the domain of the original schedule - * constraints. - */ -static isl_id *construct_group_id(struct ppcg_grouping *grouping, - __isl_take isl_space *space) -{ - isl_ctx *ctx; - isl_id *id; - isl_bool empty; - isl_union_set *domain; - - if (!space) - return NULL; - - ctx = isl_space_get_ctx(space); - domain = isl_schedule_constraints_get_domain(grouping->sc); - - do { - char buffer[20]; - isl_id *id; - isl_set *set; - - snprintf(buffer, sizeof(buffer), "G_%d", grouping->group_id); - grouping->group_id++; - id = isl_id_alloc(ctx, buffer, NULL); - space = isl_space_set_tuple_id(space, isl_dim_set, id); - set = isl_union_set_extract_set(domain, isl_space_copy(space)); - empty = isl_set_plain_is_empty(set); - isl_set_free(set); - } while (empty >= 0 && !empty); - - if (empty < 0) - space = isl_space_free(space); - - id = isl_space_get_tuple_id(space, isl_dim_set); - - isl_space_free(space); - isl_union_set_free(domain); - - return id; -} - -/* Construct a contraction from "prefix" and "domain" for a new group - * in "grouping". - * - * The values of the prefix schedule "prefix" are used as instances - * of the new group. The identifier of the group is constructed - * in such a way that it does not conflict with those of earlier - * groups nor with statements in the domain of the original - * schedule constraints. - * The isl_multi_union_pw_aff "prefix" then simply needs to be - * converted to an isl_union_pw_multi_aff. 
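schedule_from_domain_and_list() above is a small exercise in the isl schedule-tree API: start from a domain, then insert a sequence that fixes the order of the filters. A minimal sketch with two hypothetical statement sets A and B:

#include <isl/ctx.h>
#include <isl/union_set.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>

int main(void)
{
    isl_ctx *ctx = isl_ctx_alloc();
    isl_union_set *domain = isl_union_set_read_from_str(ctx,
        "{ A[i] : 0 <= i < 4; B[i] : 0 <= i < 4 }");
    isl_union_set_list *list;
    isl_schedule *schedule;
    isl_schedule_node *node;

    /* Order: all of A before all of B. */
    list = isl_union_set_list_from_union_set(
        isl_union_set_read_from_str(ctx, "{ A[i] }"));
    list = isl_union_set_list_add(list,
        isl_union_set_read_from_str(ctx, "{ B[i] }"));

    schedule = isl_schedule_from_domain(domain);
    node = isl_schedule_get_root(schedule);
    isl_schedule_free(schedule);
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_insert_sequence(node, list);
    schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);
    isl_schedule_dump(schedule);
    isl_schedule_free(schedule);
    isl_ctx_free(ctx);
    return 0;
}

Note that this only fixes the relative order within a group; keeping different groups apart is left to the schedule into which the result is later plugged.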
However, this is not - * possible if "prefix" is zero-dimensional, so in this case, - * a contraction is constructed from "domain" instead. - */ -static isl_union_pw_multi_aff *group_contraction_from_prefix_and_domain( - struct ppcg_grouping *grouping, - __isl_keep isl_multi_union_pw_aff *prefix, - __isl_keep isl_union_set *domain) -{ - isl_id *id; - isl_space *space; - int dim; - - space = isl_multi_union_pw_aff_get_space(prefix); - if (!space) - return NULL; - dim = isl_space_dim(space, isl_dim_set); - id = construct_group_id(grouping, space); - if (dim == 0) { - isl_multi_val *mv; - - space = isl_multi_union_pw_aff_get_space(prefix); - space = isl_space_set_tuple_id(space, isl_dim_set, id); - mv = isl_multi_val_zero(space); - domain = isl_union_set_copy(domain); - return isl_union_pw_multi_aff_multi_val_on_domain(domain, mv); - } - prefix = isl_multi_union_pw_aff_copy(prefix); - prefix = isl_multi_union_pw_aff_set_tuple_id(prefix, isl_dim_out, id); - return isl_union_pw_multi_aff_from_multi_union_pw_aff(prefix); -} - -/* Extend "grouping" with groups corresponding to merged - * leaves in the list of potentially merged leaves "leaves". - * - * The "list" field of each element in "leaves" contains a list - * of the instances sets of the original leaves that have been - * merged into this element. If at least two of the original leaves - * have been merged into a given element, then add the corresponding - * group to "grouping". - * In particular, the domain is extended with the statement instances - * of the merged leaves, the contraction is extended with a mapping - * of these statement instances to instances of a new group and - * the schedule is extended with a schedule that executes - * the statement instances according to the order of the leaves - * in which they appear. - * Since the instances of the groups should already be scheduled apart - * in the schedule into which this schedule will be plugged in, - * the schedules of the individual groups are combined independently - * of each other (as a set). - */ -static isl_stat add_groups(struct ppcg_grouping *grouping, - int n, struct ppcg_grouping_leaf leaves[]) -{ - int i; - - for (i = 0; i < n; ++i) { - int n_leaf; - isl_schedule *schedule; - isl_union_set *domain; - isl_union_pw_multi_aff *upma; - - n_leaf = isl_union_set_list_n_union_set(leaves[i].list); - if (n_leaf < 0) - return isl_stat_error; - if (n_leaf <= 1) - continue; - schedule = schedule_from_domain_and_list(leaves[i].domain, - leaves[i].list); - upma = group_contraction_from_prefix_and_domain(grouping, - leaves[i].prefix, leaves[i].domain); - - domain = isl_union_set_copy(leaves[i].domain); - if (grouping->domain) { - domain = isl_union_set_union(domain, grouping->domain); - upma = isl_union_pw_multi_aff_union_add(upma, - grouping->contraction); - schedule = isl_schedule_set(schedule, - grouping->schedule); - } - grouping->domain = domain; - grouping->contraction = upma; - grouping->schedule = schedule; - - if (!grouping->domain || !grouping->contraction || - !grouping->schedule) - return isl_stat_error; - } - - return isl_stat_ok; -} - -/* Look for any pairs of consecutive leaves among the "n" children of "node" - * starting at "first" that should be merged together. - * Store the results in "grouping". - * - * First make sure the intersection of validity and proximity - * schedule constraints is available and extract the required - * information from the "n" leaves. - * Then try and merge consecutive leaves based on the validity - * and proximity constraints. 
- * If any pairs were successfully merged, then add groups - * corresponding to the merged leaves to "grouping". - */ -static isl_stat group_subsequence(__isl_keep isl_schedule_node *node, - int first, int n, struct ppcg_grouping *grouping) -{ - int n_merge; - struct ppcg_grouping_leaf *leaves; - - if (ppcg_grouping_compute_dep(grouping) < 0) - return isl_stat_error; - - leaves = extract_leaves(node, first, n); - if (!leaves) - return isl_stat_error; - - n_merge = merge_leaves(n, leaves, grouping->dep); - if (n_merge >= 0 && n_merge < n && - add_groups(grouping, n_merge, leaves) < 0) - return isl_stat_error; - - ppcg_grouping_leaf_free(n, leaves); - - return isl_stat_ok; -} - -/* If "node" is a sequence, then check if it has any consecutive - * leaves that should be merged together and store the results - * in "grouping". - * - * In particular, call group_subsequence on each consecutive - * sequence of (filtered) leaves among the children of "node". - */ -static isl_bool detect_groups(__isl_keep isl_schedule_node *node, void *user) -{ - int i, n, first; - struct ppcg_grouping *grouping = user; - - if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence) - return isl_bool_true; - - n = isl_schedule_node_n_children(node); - if (n < 0) - return isl_bool_error; - - first = -1; - for (i = 0; i < n; ++i) { - isl_schedule_node *child; - enum isl_schedule_node_type type; - - child = isl_schedule_node_get_child(node, i); - child = isl_schedule_node_child(child, 0); - type = isl_schedule_node_get_type(child); - isl_schedule_node_free(child); - - if (first >= 0 && type != isl_schedule_node_leaf) { - if (group_subsequence(node, first, i - first, - grouping) < 0) - return isl_bool_error; - first = -1; - } - if (first < 0 && type == isl_schedule_node_leaf) - first = i; - } - if (first >= 0) { - if (group_subsequence(node, first, n - first, grouping) < 0) - return isl_bool_error; - } - - return isl_bool_true; -} - -/* Complete "grouping" to cover all statement instances in the domain - * of grouping->sc. - * - * In particular, grouping->domain is set to the full set of statement - * instances; group->contraction is extended with an identity - * contraction on the additional instances and group->schedule - * is extended with an independent schedule on those additional instances. - * In the extension of group->contraction, the additional instances - * are split into those that belong to different statements and those - * that belong to some of the same statements. The first group - * is replaced by its universe in order to simplify the contraction extension.
- */ -static void complete_grouping(struct ppcg_grouping *grouping) -{ - isl_union_set *domain, *left, *overlap; - isl_union_pw_multi_aff *upma; - isl_schedule *schedule; - - domain = isl_schedule_constraints_get_domain(grouping->sc); - left = isl_union_set_subtract(isl_union_set_copy(domain), - isl_union_set_copy(grouping->domain)); - schedule = isl_schedule_from_domain(isl_union_set_copy(left)); - schedule = isl_schedule_set(schedule, grouping->schedule); - grouping->schedule = schedule; - - overlap = isl_union_set_universe(grouping->domain); - grouping->domain = domain; - overlap = isl_union_set_intersect(isl_union_set_copy(left), overlap); - left = isl_union_set_subtract(left, isl_union_set_copy(overlap)); - left = isl_union_set_universe(left); - left = isl_union_set_union(left, overlap); - upma = isl_union_set_identity_union_pw_multi_aff(left); - upma = isl_union_pw_multi_aff_union_add(upma, grouping->contraction); - grouping->contraction = upma; -} - -/* Compute a schedule on the domain of "sc" that respects the schedule - * constraints in "sc". - * - * "schedule" is a known correct schedule that is used to combine - * groups of statements if options->group_chains is set. - * In particular, statements that are executed consecutively in a sequence - * in this schedule and where all instances of the second depend on - * the instance of the first that is executed in the same iteration - * of outer band nodes are grouped together into a single statement. - * The schedule constraints are then mapped to these groups of statements - * and the resulting schedule is expanded again to refer to the original - * statements. - */ -__isl_give isl_schedule *ppcg_compute_schedule( - __isl_take isl_schedule_constraints *sc, - __isl_keep isl_schedule *schedule, struct ppcg_options *options) -{ - struct ppcg_grouping grouping = { sc }; - isl_union_pw_multi_aff *contraction; - isl_union_map *umap; - isl_schedule *res, *expansion; - - if (!options->group_chains) - return isl_schedule_constraints_compute_schedule(sc); - - grouping.group_id = 0; - if (isl_schedule_foreach_schedule_node_top_down(schedule, - &detect_groups, &grouping) < 0) - goto error; - if (!grouping.contraction) { - ppcg_grouping_clear(&grouping); - return isl_schedule_constraints_compute_schedule(sc); - } - complete_grouping(&grouping); - contraction = isl_union_pw_multi_aff_copy(grouping.contraction); - umap = isl_union_map_from_union_pw_multi_aff(contraction); - - sc = isl_schedule_constraints_apply(sc, umap); - - res = isl_schedule_constraints_compute_schedule(sc); - - contraction = isl_union_pw_multi_aff_copy(grouping.contraction); - expansion = isl_schedule_copy(grouping.schedule); - res = isl_schedule_expand(res, contraction, expansion); - - ppcg_grouping_clear(&grouping); - return res; -error: - ppcg_grouping_clear(&grouping); - isl_schedule_constraints_free(sc); - return NULL; -} diff --git a/polly/lib/External/ppcg/hybrid.h b/polly/lib/External/ppcg/hybrid.h deleted file mode 100644 --- a/polly/lib/External/ppcg/hybrid.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef HYBRID_H -#define HYBRID_H - -#include -#include - -#include "ppcg.h" - -struct ppcg_ht_bounds; -typedef struct ppcg_ht_bounds ppcg_ht_bounds; - -struct ppcg_ht_phase; -typedef struct ppcg_ht_phase ppcg_ht_phase; - -isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node); -isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node); - -__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop, - __isl_keep isl_schedule_node 
*node); -void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds); -isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds); -isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds, - __isl_keep isl_multi_val *sizes); -__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling( - __isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes, - __isl_take isl_schedule_node *node, struct ppcg_options *options); -__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free( - __isl_take ppcg_ht_bounds *bounds); - -__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark( - __isl_keep isl_schedule_node *node); -__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point( - __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node); -__isl_give isl_schedule_node *hybrid_tile_foreach_phase( - __isl_take isl_schedule_node *node, - __isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node, - void *user), void *user); -__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks( - __isl_take isl_schedule_node *node); - -#endif diff --git a/polly/lib/External/ppcg/hybrid.c b/polly/lib/External/ppcg/hybrid.c deleted file mode 100644 --- a/polly/lib/External/ppcg/hybrid.c +++ /dev/null @@ -1,2242 +0,0 @@ -/* - * Copyright 2013 Ecole Normale Superieure - * Copyright 2015 Sven Verdoolaege - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hybrid.h" -#include "schedule.h" - -/* The hybrid tiling implemented in this file is based on - * Grosser et al., "Hybrid Hexagonal/Classical Tiling for GPUs". - */ - -/* Bounds on relative dependence distances in input to hybrid tiling. - * upper is an upper bound on the relative dependence distances - * in the first space dimension - * -lower is a lower bound on the relative dependence distances - * in all space dimensions. - * - * In particular, - * - * d_i >= -lower_i d_0 - * and - * d_1 <= upper d_0 - * - * for each dependence distance vector d, where d_1 is the component - * corresponding to the first space dimension. - * - * upper and lower are always non-negative. - * Some of the values may be NaN if no bound could be found. - */ -struct ppcg_ht_bounds { - isl_val *upper; - isl_multi_val *lower; -}; - -/* Free "bounds" along with all its fields. - */ -__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free( - __isl_take ppcg_ht_bounds *bounds) -{ - if (!bounds) - return NULL; - isl_val_free(bounds->upper); - isl_multi_val_free(bounds->lower); - free(bounds); - - return NULL; -} - -/* Create a ppcg_ht_bounds object for a band living in "space". - * The bounds are initialized to NaN. 
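For illustration (hypothetical stencil, not part of this patch): if the dependence distance vectors of a 1-D stencil in (time, space) coordinates are (1, -1), (1, 0) and (1, 1), then d_1 >= -1 * d_0 and d_1 <= 1 * d_0, so such a bounds object would hold lower = (1) and upper = 1.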
- */ -__isl_give ppcg_ht_bounds *ppcg_ht_bounds_alloc(__isl_take isl_space *space) -{ - int i, n; - isl_ctx *ctx; - ppcg_ht_bounds *bounds; - - if (!space) - return NULL; - - ctx = isl_space_get_ctx(space); - bounds = isl_alloc_type(ctx, struct ppcg_ht_bounds); - if (!bounds) - goto error; - bounds->upper = isl_val_nan(ctx); - bounds->lower = isl_multi_val_zero(space); - n = isl_multi_val_dim(bounds->lower, isl_dim_set); - for (i = 0; i < n; ++i) { - isl_val *v = isl_val_copy(bounds->upper); - bounds->lower = isl_multi_val_set_val(bounds->lower, i, v); - } - - if (!bounds->lower || !bounds->upper) - return ppcg_ht_bounds_free(bounds); - - return bounds; -error: - isl_space_free(space); - return NULL; -} - -void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds) -{ - if (!bounds) - return; - - fprintf(stderr, "lower: "); - isl_multi_val_dump(bounds->lower); - fprintf(stderr, "upper: "); - isl_val_dump(bounds->upper); -} - -/* Return the upper bound on the relative dependence distances - * in the first space dimension. - */ -__isl_give isl_val *ppcg_ht_bounds_get_upper(__isl_keep ppcg_ht_bounds *bounds) -{ - if (!bounds) - return NULL; - return isl_val_copy(bounds->upper); -} - -/* Replace the upper bound on the relative dependence distances - * in the first space dimension by "upper". - */ -__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_upper( - __isl_take ppcg_ht_bounds *bounds, __isl_take isl_val *upper) -{ - if (!bounds || !upper) - goto error; - isl_val_free(bounds->upper); - bounds->upper = upper; - return bounds; -error: - ppcg_ht_bounds_free(bounds); - isl_val_free(upper); - return NULL; -} - -/* Return the lower bound on the relative dependence distances - * in space dimension "pos". - */ -__isl_give isl_val *ppcg_ht_bounds_get_lower(__isl_keep ppcg_ht_bounds *bounds, - int pos) -{ - if (!bounds) - return NULL; - return isl_multi_val_get_val(bounds->lower, pos); -} - -/* Replace the lower bound on the relative dependence distances - * in space dimension "pos" by "lower". - */ -__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_lower( - __isl_take ppcg_ht_bounds *bounds, int pos, __isl_take isl_val *lower) -{ - if (!bounds || !lower) - goto error; - bounds->lower = isl_multi_val_set_val(bounds->lower, pos, lower); - if (!bounds->lower) - return ppcg_ht_bounds_free(bounds); - return bounds; -error: - ppcg_ht_bounds_free(bounds); - isl_val_free(lower); - return NULL; -} - -/* Can the bounds on relative dependence distances recorded in "bounds" - * be used to perform hybrid tiling? - * In particular, have appropriate lower and upper bounds been found? - * Any NaN indicates that no corresponding bound was found. - */ -isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds) -{ - isl_bool is_nan; - int i, n; - - if (!bounds) - return isl_bool_error; - is_nan = isl_val_is_nan(bounds->upper); - if (is_nan < 0) - return isl_bool_error; - if (is_nan) - return isl_bool_false; - - n = isl_multi_val_dim(bounds->lower, isl_dim_set); - for (i = 0; i < n; ++i) { - isl_val *v; - - v = isl_multi_val_get_val(bounds->lower, i); - is_nan = isl_val_is_nan(v); - if (is_nan < 0) - return isl_bool_error; - if (is_nan) - return isl_bool_false; - isl_val_free(v); - } - - return isl_bool_true; -} - -/* Structure that represents the basic hexagonal tiling, - * along with information that is needed to perform the hybrid tiling. - * - * "bounds" are the bounds on the dependence distances that - * define the hexagonal shape and the required skewing in the remaining - * space dimensions. 
- * - * "input_node" points to the input pair of band nodes. - * "input_schedule" is the partial schedule of this input pair of band nodes. - * The space of this schedule is [P -> C], where P is the space - * of the parent node and C is the space of the child node. - * - * "space_sizes" represent the total size of a tile for the space - * dimensions, i.e., those corresponding to the child node. - * The space of "space_sizes" is C. - * If S_0 is the original tile size in the first space dimension, - * then the first entry of "space_sizes" is equal to - * W = 2*S_0 + floor(d_l h) + floor(d_u h). - * The remaining entries are the same as in the original tile sizes. - * - * The basic hexagonal tiling "hex" is defined - * in a "ts" (time-space) space and corresponds to the phase-1 tiles. - * "time_tile" maps the "ts" space to outer time tile. - * Is is equal to ts[t, s] -> floor(t/(2 * S_t)), with S_t the original tile - * size corresponding to the parent node. - * "local_time" maps the "ts" space to the time dimension inside each tile. - * It is equal to ts[t, s] -> t mod (2 S_t), with S_t the original tile - * size corresponding to the parent node. - * "shift_space" shifts the tiles at time tile T = floor(t/(2 S_t)) - * in the space dimension such that they align to a multiple of W. - * It is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W, - * with shift_s = S_0 + floor(d_u h). - * "shift_phase" is the shift taken to go from phase 0 to phase 1 - * It is equal to ts[t, s] -> ts[t + S_t, s + shift_s], - * with shift_s = S_0 + floor(d_u h). - * - * "project_ts" projects the space of the input schedule to the ts-space. - * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0]. - */ -struct ppcg_ht_tiling { - int ref; - - ppcg_ht_bounds *bounds; - isl_schedule_node *input_node; - isl_multi_union_pw_aff *input_schedule; - - isl_multi_val *space_sizes; - - isl_aff *time_tile; - isl_aff *local_time; - isl_aff *shift_space; - isl_multi_aff *shift_phase; - isl_set *hex; - - isl_multi_aff *project_ts; -}; -typedef struct ppcg_ht_tiling ppcg_ht_tiling; - -/* Return the space of the pair of band nodes that form the input - * to the hybrid tiling. - * In particular, return the space [P -> C], where P is the space - * of the parent node and C is the space of the child node. - */ -__isl_give isl_space *ppcg_ht_tiling_get_input_space( - __isl_keep ppcg_ht_tiling *tile) -{ - if (!tile) - return NULL; - - return isl_multi_union_pw_aff_get_space(tile->input_schedule); -} - -/* Remove a reference to "tile" and free "tile" along with all its fields - * as soon as the reference count drops to zero. - */ -static __isl_null ppcg_ht_tiling *ppcg_ht_tiling_free( - __isl_take ppcg_ht_tiling *tiling) -{ - if (!tiling) - return NULL; - if (--tiling->ref > 0) - return NULL; - - ppcg_ht_bounds_free(tiling->bounds); - isl_schedule_node_free(tiling->input_node); - isl_multi_union_pw_aff_free(tiling->input_schedule); - isl_multi_val_free(tiling->space_sizes); - isl_aff_free(tiling->time_tile); - isl_aff_free(tiling->local_time); - isl_aff_free(tiling->shift_space); - isl_multi_aff_free(tiling->shift_phase); - isl_set_free(tiling->hex); - isl_multi_aff_free(tiling->project_ts); - free(tiling); - - return NULL; -} - -/* Return a new reference to "tiling". - */ -__isl_give ppcg_ht_tiling *ppcg_ht_tiling_copy( - __isl_keep ppcg_ht_tiling *tiling) -{ - if (!tiling) - return NULL; - - tiling->ref++; - return tiling; -} - -/* Return the isl_ctx to which "tiling" belongs. 
- */ -isl_ctx *ppcg_ht_tiling_get_ctx(__isl_keep ppcg_ht_tiling *tiling) -{ - if (!tiling) - return NULL; - - return isl_multi_union_pw_aff_get_ctx(tiling->input_schedule); -} - -/* Representation of one of the two phases of hybrid tiling. - * - * "tiling" points to the shared tiling data. - * - * "time_tile", "local_time" and "shift_space" are equal to the corresponding - * fields of "tiling", pulled back to the input space. - * In case of phase 0, these expressions have also been moved - * from phase 1 to phase 0. - * - * "domain" contains the hexagonal tiling of this phase. - * - * "space_shift" is the shift that should be added to the space band - * in order to be able to apply rectangular tiling to the space. - * For phase 1, it is equal to - * - * [P[t] -> C[s_0, s_i]] -> C[(-(2 * shift_s)*T) % W, dl_i * u] - * - * with shift_s = S_0 + floor(d_u h), - * T equal to "time_tile" and u equal to "local_time". - * For phase 0, it is equal to - * - * [P[t] -> C[s_0, s_i]] -> C[shift_s + (-(2 * shift_s)*T) % W, dl_i * u] - * - * "space_tile" is the space tiling. It is equal to - * - * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)] - */ -struct ppcg_ht_phase { - ppcg_ht_tiling *tiling; - - isl_aff *time_tile; - isl_aff *local_time; - isl_aff *shift_space; - isl_set *domain; - - isl_multi_aff *space_shift; - isl_multi_aff *space_tile; -}; - -/* Free "phase" along with all its fields. - */ -static __isl_null ppcg_ht_phase *ppcg_ht_phase_free( - __isl_take ppcg_ht_phase *phase) -{ - if (!phase) - return NULL; - - ppcg_ht_tiling_free(phase->tiling); - isl_aff_free(phase->time_tile); - isl_aff_free(phase->local_time); - isl_aff_free(phase->shift_space); - isl_set_free(phase->domain); - isl_multi_aff_free(phase->space_shift); - isl_multi_aff_free(phase->space_tile); - free(phase); - - return NULL; -} - -/* Wrapper around ppcg_ht_phase_free for use as an argument - * to isl_id_set_free_user. - */ -static void ppcg_ht_phase_free_wrap(void *user) -{ - ppcg_ht_phase *phase = user; - - ppcg_ht_phase_free(phase); -} - -/* Return the domain of hybrid tiling phase "phase". - */ -static __isl_give isl_set *ppcg_ht_phase_get_domain(ppcg_ht_phase *phase) -{ - if (!phase) - return NULL; - - return isl_set_copy(phase->domain); -} - -/* Return the space of the pair of band nodes that form the input - * to the hybrid tiling of which "phase" is a phase. - * In particular, return the space [P -> C], where P is the space - * of the parent node and C is the space of the child node. - */ -static __isl_give isl_space *ppcg_ht_phase_get_input_space( - __isl_keep ppcg_ht_phase *phase) -{ - if (!phase) - return NULL; - - return ppcg_ht_tiling_get_input_space(phase->tiling); -} - -/* Construct the lower left constraint of the hexagonal tile, i.e., - * - * du a - b <= (2h+1) du - duh - * -du a + b + (2h+1) du - duh >= 0 - * - * where duh = floor(du * h). - * - * This constraint corresponds to (6) in - * "Hybrid Hexagonal/Classical Tiling for GPUs".
- */ -static __isl_give isl_constraint *hex_lower_left(__isl_take isl_local_space *ls, - __isl_keep isl_val *h, __isl_keep isl_val *du, __isl_keep isl_val *duh) -{ - isl_val *v; - isl_aff *aff; - - v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); - v = isl_val_mul(v, isl_val_copy(du)); - v = isl_val_sub(v, isl_val_copy(duh)); - aff = isl_aff_val_on_domain(ls, v); - v = isl_val_neg(isl_val_copy(du)); - aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the lower constraint of the hexagonal tile, i.e., - * - * a <= 2h+1 - * -a + 2h+1 >= 0 - * - * This constraint corresponds to (7) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". - */ -static __isl_give isl_constraint *hex_lower(__isl_take isl_local_space *ls, - __isl_keep isl_val *h) -{ - isl_val *v; - isl_aff *aff; - - v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); - aff = isl_aff_val_on_domain(ls, v); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 0, -1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the lower right constraint of the hexagonal tile, i.e., - * - * dl a + b <= (2h+1) dl + duh + (s0-1) - * -dl a - b + (2h+1) dl + duh + (s0-1) >= 0 - * - * where duh = floor(du * h). - * - * This constraint corresponds to (8) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". - */ -static __isl_give isl_constraint *hex_lower_right( - __isl_take isl_local_space *ls, __isl_keep isl_val *h, - __isl_keep isl_val *s0, __isl_keep isl_val *dl, __isl_keep isl_val *duh) -{ - isl_val *v; - isl_aff *aff; - - v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); - v = isl_val_mul(v, isl_val_copy(dl)); - v = isl_val_add(v, isl_val_copy(duh)); - v = isl_val_add(v, isl_val_copy(s0)); - v = isl_val_sub_ui(v, 1); - aff = isl_aff_val_on_domain(ls, v); - v = isl_val_neg(isl_val_copy(dl)); - aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the upper left constraint of the hexagonal tile, i.e., - * - * dl a + b >= h dl - (d - 1)/d with d = den(dl) - * dl a + b - h dl + (d - 1)/d >= 0 - * - * This constraint corresponds to (10) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". - */ -static __isl_give isl_constraint *hex_upper_left(__isl_take isl_local_space *ls, - __isl_keep isl_val *h, __isl_keep isl_val *dl) -{ - isl_val *v, *d; - isl_aff *aff; - - d = isl_val_get_den_val(dl); - v = isl_val_sub_ui(isl_val_copy(d), 1); - v = isl_val_div(v, d); - v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(dl))); - aff = isl_aff_val_on_domain(ls, v); - aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(dl)); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the upper right constraint of the hexagonal tile, i.e., - * - * du a - b >= du h - duh - (s0-1) - dlh - (d - 1)/d with d = den(du) - * du a - b - du h + duh + (s0-1) + dlh + (d - 1)/d >= 0 - * - * where dlh = floor(dl * h) and duh = floor(du * h). - * - * This constraint corresponds to (12) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". 
- */ -static __isl_give isl_constraint *hex_upper_right( - __isl_take isl_local_space *ls, __isl_keep isl_val *h, - __isl_keep isl_val *s0, __isl_keep isl_val *du, - __isl_keep isl_val *dlh, __isl_keep isl_val *duh) -{ - isl_val *v, *d; - isl_aff *aff; - - d = isl_val_get_den_val(du); - v = isl_val_sub_ui(isl_val_copy(d), 1); - v = isl_val_div(v, d); - v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(du))); - v = isl_val_add(v, isl_val_copy(duh)); - v = isl_val_add(v, isl_val_copy(dlh)); - v = isl_val_add(v, isl_val_copy(s0)); - v = isl_val_sub_ui(v, 1); - aff = isl_aff_val_on_domain(ls, v); - aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(du)); - aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1); - - return isl_inequality_from_aff(aff); -} - -/* Construct the upper constraint of the hexagonal tile, i.e., - * - * a >= 0 - * - * This constraint corresponds to (13) in - * "Hybrid Hexagonal/Classical Tiling for GPUs". - */ -static __isl_give isl_constraint *hex_upper(__isl_take isl_local_space *ls) -{ - isl_aff *aff; - - aff = isl_aff_var_on_domain(ls, isl_dim_set, 0); - - return isl_inequality_from_aff(aff); -} - -/* Construct the basic hexagonal tile shape. - * "space" is the 2D space in which the hexagon should be constructed. - * h is st-1, with st the tile size in the time dimension - * s0 is the tile size in the space dimension - * dl is a bound on the negative relative dependence distances, i.e., - * - * d_s >= -dl d_t - * - * du is a bound on the positive relative dependence distances, i.e., - * - * d_s <= du d_t - * - * with (d_t,d_s) any dependence distance vector. - * dlh = floor(dl * h) - * duh = floor(du * h) - * - * The shape of the hexagon is as follows: - * - * 0 dlh dlh+s0-1 - * ______ __ - * 0 / \_ / - * / \_ / - * h / \ ______ / - * h+1 \_ // \\_ - * \_ // \\_ - * 2h+1 \______// \\ - * 0 duh duh+s0-1 - * duh+s0-1+dlh - * duh+s0-1+dlh+1+s0+1 - * - * The next hexagon is shifted by duh + dlh + 2 * s0. - * - * The slope of the "/" constraints is dl. - * The slope of the "\_" constraints is du. - */ -static __isl_give isl_set *compute_hexagon(__isl_take isl_space *space, - __isl_keep isl_val *h, __isl_keep isl_val *s0, - __isl_keep isl_val *dl, __isl_keep isl_val *du, - __isl_keep isl_val *dlh, __isl_keep isl_val *duh) -{ - isl_local_space *ls; - isl_constraint *c; - isl_basic_set *bset; - - ls = isl_local_space_from_space(space); - - c = hex_lower_left(isl_local_space_copy(ls), h, du, duh); - bset = isl_basic_set_from_constraint(c); - - c = hex_lower(isl_local_space_copy(ls), h); - bset = isl_basic_set_add_constraint(bset, c); - - c = hex_lower_right(isl_local_space_copy(ls), h, s0, dl, duh); - bset = isl_basic_set_add_constraint(bset, c); - - c = hex_upper_left(isl_local_space_copy(ls), h, dl); - bset = isl_basic_set_add_constraint(bset, c); - - c = hex_upper_right(isl_local_space_copy(ls), h, s0, du, dlh, duh); - bset = isl_basic_set_add_constraint(bset, c); - - c = hex_upper(ls); - bset = isl_basic_set_add_constraint(bset, c); - - return isl_set_from_basic_set(bset); -} - -/* Name of the ts-space. - */ -static const char *ts_space_name = "ts"; - -/* Construct and return the space ts[t, s]. - */ -static __isl_give isl_space *construct_ts_space(isl_ctx *ctx) -{ - isl_space *s; - - s = isl_space_set_alloc(ctx, 0, 2); - s = isl_space_set_tuple_name(s, isl_dim_set, ts_space_name); - - return s; -} - -/* Name of the local ts-space. - */ -static const char *local_ts_space_name = "local_ts"; - -/* Construct and return the space local_ts[t, s].
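As a worked example (hypothetical sizes, not part of this patch): for st = 3 (h = 2), s0 = 3 and dl = du = 1, so that dlh = duh = 2, the six constraints above reduce to

	{ local_ts[t, s] : 0 <= t <= 5 and t - 3 <= s <= t + 4 and 2 <= t + s <= 9 }

and the hexagon period duh + dlh + 2 * s0 evaluates to 10, matching the total space tile size W = 2 * s0 + dlh + duh.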
- */ -static __isl_give isl_space *construct_local_ts_space(isl_ctx *ctx) -{ - isl_space *s; - - s = isl_space_set_alloc(ctx, 0, 2); - s = isl_space_set_tuple_name(s, isl_dim_set, local_ts_space_name); - - return s; -} - -/* Compute the total size of a tile for the space dimensions, - * i.e., those corresponding to the child node - * of the input pattern. - * If S_0 is the original tile size in the first space dimension, - * then the first entry of "space_sizes" is equal to - * W = 2*S_0 + floor(d_l h) + floor(d_u h). - * The remaining entries are the same as in the original tile sizes. - * "tile_sizes" contains the original tile sizes, including - * the tile size corresponding to the parent node. - * "dlh" is equal to floor(d_l h). - * "duh" is equal to floor(d_u h). - */ -static __isl_give isl_multi_val *compute_space_sizes( - __isl_keep isl_multi_val *tile_sizes, - __isl_keep isl_val *dlh, __isl_keep isl_val *duh) -{ - isl_val *size; - isl_multi_val *space_sizes; - - space_sizes = isl_multi_val_copy(tile_sizes); - space_sizes = isl_multi_val_factor_range(space_sizes); - size = isl_multi_val_get_val(space_sizes, 0); - size = isl_val_mul_ui(size, 2); - size = isl_val_add(size, isl_val_copy(duh)); - size = isl_val_add(size, isl_val_copy(dlh)); - space_sizes = isl_multi_val_set_val(space_sizes, 0, size); - - return space_sizes; -} - -/* Compute the offset of phase 1 with respect to phase 0 - * in the ts-space ("space"). - * In particular, return - * - * ts[st, s0 + duh] - */ -static __isl_give isl_multi_val *compute_phase_shift( - __isl_keep isl_space *space, __isl_keep isl_val *st, - __isl_keep isl_val *s0, __isl_keep isl_val *duh) -{ - isl_val *v; - isl_multi_val *phase_shift; - - phase_shift = isl_multi_val_zero(isl_space_copy(space)); - phase_shift = isl_multi_val_set_val(phase_shift, 0, isl_val_copy(st)); - v = isl_val_add(isl_val_copy(duh), isl_val_copy(s0)); - phase_shift = isl_multi_val_set_val(phase_shift, 1, v); - - return phase_shift; -} - -/* Return the function - * - * ts[t, s] -> floor(t/(2 * st)) - * - * representing the time tile. - * "space" is the space ts[t, s]. - */ -static __isl_give isl_aff *compute_time_tile(__isl_keep isl_space *space, - __isl_keep isl_val *st) -{ - isl_val *v; - isl_aff *t; - isl_local_space *ls; - - ls = isl_local_space_from_space(isl_space_copy(space)); - t = isl_aff_var_on_domain(ls, isl_dim_set, 0); - v = isl_val_mul_ui(isl_val_copy(st), 2); - t = isl_aff_floor(isl_aff_scale_down_val(t, v)); - - return t; -} - -/* Compute a shift in the space dimension for tiles - * at time tile T = floor(t/(2 * S_t)) - * such that they align to a multiple of the total space tile dimension W. - * In particular, compute - * - * ts[t, s] -> s + (-(2 * shift_s)*T) % W - * - * where shift_s is the shift of phase 1 with respect to phase 0 - * in the space dimension (the first element of "phase_shift"). - * W is stored in the first element of "space_sizes". - * "time_tile" is the function - * - * ts[t, s] -> floor(t/(2 * S_T)) - * - * Since phase 1 is shifted by shift_s with respect to phase 0, - * the next line of phase 0 (at T+1) is shifted by 2*shift_s - * with respect to the previous line (at T). - * A shift of -(2 * shift_s)*T therefore allows the basic pattern - * (which starts at 0) to be applied. - * However, this shift will be used to obtain the tile coordinate - * in the first space dimension and if the original values - * in the space dimension are non-negative, then the shift should - * not make them negative. 
Moreover, the shift should be as small - * as possible. - * Since the pattern repeats itself with a period of W in the space - * dimension, the shift can be replaced by (-(2 * shift_s)*T) % W. - */ -static __isl_give isl_aff *compute_shift_space(__isl_keep isl_aff *time_tile, - __isl_keep isl_multi_val *space_sizes, - __isl_keep isl_multi_val *phase_shift) -{ - isl_val *v; - isl_aff *s, *t; - isl_local_space *ls; - - ls = isl_local_space_from_space(isl_aff_get_domain_space(time_tile)); - t = isl_aff_copy(time_tile); - v = isl_val_mul_ui(isl_multi_val_get_val(phase_shift, 1), 2); - v = isl_val_neg(v); - t = isl_aff_scale_val(t, v); - v = isl_multi_val_get_val(space_sizes, 0); - t = isl_aff_mod_val(t, v); - s = isl_aff_var_on_domain(ls, isl_dim_set, 1); - s = isl_aff_add(s, t); - - return s; -} - -/* Given the phase_shift ts[S_t, S_0 + floor(d_u h)], - * compute a function that applies the shift, i.e., - * - * ts[t, s] -> ts[t + S_t, s + S_0 + floor(d_u h)], - */ -static __isl_give isl_multi_aff *compute_shift_phase( - __isl_keep isl_multi_val *phase_shift) -{ - isl_space *space; - isl_multi_aff *shift; - - space = isl_multi_val_get_space(phase_shift); - shift = isl_multi_aff_multi_val_on_space(space, - isl_multi_val_copy(phase_shift)); - space = isl_multi_aff_get_space(shift); - shift = isl_multi_aff_add(shift, isl_multi_aff_identity(space)); - - return shift; -} - -/* Compute a mapping from the ts-space to the local coordinates - * within each tile. In particular, compute - * - * ts[t, s] -> local_ts[t % (2 S_t), (s + (-(2 * shift_s)*T) % W) % W] - * - * "ts" is the space ts[t, s] - * "local_ts" is the space local_ts[t, s] - * "shift_space" is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W - * "st" is the tile size in the time dimension S_t. - * The first element of "space_sizes" is equal to W. - */ -static __isl_give isl_multi_aff *compute_localize( - __isl_keep isl_space *local_ts, __isl_keep isl_aff *shift_space, - __isl_keep isl_val *st, __isl_keep isl_multi_val *space_sizes) -{ - isl_val *v; - isl_space *space; - isl_aff *s, *t; - isl_multi_aff *localize; - - space = isl_aff_get_domain_space(shift_space); - local_ts = isl_space_copy(local_ts); - space = isl_space_map_from_domain_and_range(space, local_ts); - localize = isl_multi_aff_identity(space); - t = isl_multi_aff_get_aff(localize, 0); - v = isl_val_mul_ui(isl_val_copy(st), 2); - t = isl_aff_mod_val(t, v); - localize = isl_multi_aff_set_aff(localize, 0, t); - s = isl_aff_copy(shift_space); - v = isl_multi_val_get_val(space_sizes, 0); - s = isl_aff_mod_val(s, v); - localize = isl_multi_aff_set_aff(localize, 1, s); - - return localize; -} - -/* Set the project_ts field of "tiling". - * - * This field projects the space of the input schedule to the ts-space. - * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0]. - */ -static __isl_give ppcg_ht_tiling *ppcg_ht_tiling_set_project_ts( - __isl_take ppcg_ht_tiling *tiling) -{ - int n; - isl_space *space; - isl_multi_aff *project; - - if (!tiling) - return NULL; - - space = ppcg_ht_tiling_get_input_space(tiling); - n = isl_space_dim(space, isl_dim_set); - project = isl_multi_aff_project_out_map(space, isl_dim_set, 2, n - 2); - project = isl_multi_aff_set_tuple_name(project, - isl_dim_out, ts_space_name); - if (!project) - return ppcg_ht_tiling_free(tiling); - - tiling->project_ts = project; - - return tiling; -} - -/* Construct a hybrid tiling description from bounds on the dependence - * distances "bounds". - * "input_node" points to the original parent node.
- * "input_schedule" is the combined schedule of the parent and child - * node in the input. - * "tile_sizes" are the original, user specified tile sizes. - */ -static __isl_give ppcg_ht_tiling *ppcg_ht_bounds_construct_tiling( - __isl_take ppcg_ht_bounds *bounds, - __isl_keep isl_schedule_node *input_node, - __isl_keep isl_multi_union_pw_aff *input_schedule, - __isl_keep isl_multi_val *tile_sizes) -{ - isl_ctx *ctx; - ppcg_ht_tiling *tiling; - isl_multi_val *space_sizes, *phase_shift; - isl_aff *time_tile, *shift_space; - isl_multi_aff *localize; - isl_val *h, *duh, *dlh; - isl_val *st, *s0, *du, *dl; - isl_space *ts, *local_ts; - - if (!bounds || !input_node || !input_schedule || !tile_sizes) - goto error; - - ctx = isl_multi_union_pw_aff_get_ctx(input_schedule); - tiling = isl_calloc_type(ctx, struct ppcg_ht_tiling); - if (!tiling) - goto error; - tiling->ref = 1; - - st = isl_multi_val_get_val(tile_sizes, 0); - h = isl_val_sub_ui(isl_val_copy(st), 1); - s0 = isl_multi_val_get_val(tile_sizes, 1); - du = ppcg_ht_bounds_get_upper(bounds); - dl = ppcg_ht_bounds_get_lower(bounds, 0); - - duh = isl_val_floor(isl_val_mul(isl_val_copy(du), isl_val_copy(h))); - dlh = isl_val_floor(isl_val_mul(isl_val_copy(dl), isl_val_copy(h))); - - ts = construct_ts_space(ctx); - local_ts = construct_local_ts_space(ctx); - - space_sizes = compute_space_sizes(tile_sizes, dlh, duh); - phase_shift = compute_phase_shift(ts, st, s0, duh); - time_tile = compute_time_tile(ts, st); - shift_space = compute_shift_space(time_tile, space_sizes, phase_shift); - localize = compute_localize(local_ts, shift_space, st, space_sizes); - isl_space_free(ts); - - tiling->input_node = isl_schedule_node_copy(input_node); - tiling->input_schedule = isl_multi_union_pw_aff_copy(input_schedule); - tiling->space_sizes = space_sizes; - tiling->bounds = bounds; - tiling->local_time = isl_multi_aff_get_aff(localize, 0); - tiling->hex = compute_hexagon(local_ts, h, s0, dl, du, dlh, duh); - tiling->hex = isl_set_preimage_multi_aff(tiling->hex, localize); - tiling->time_tile = time_tile; - tiling->shift_space = shift_space; - tiling->shift_phase = compute_shift_phase(phase_shift); - isl_multi_val_free(phase_shift); - - isl_val_free(duh); - isl_val_free(dlh); - isl_val_free(du); - isl_val_free(dl); - isl_val_free(s0); - isl_val_free(st); - isl_val_free(h); - - if (!tiling->input_schedule || !tiling->local_time || !tiling->hex || - !tiling->shift_space || !tiling->shift_phase) - return ppcg_ht_tiling_free(tiling); - - tiling = ppcg_ht_tiling_set_project_ts(tiling); - - return tiling; -error: - ppcg_ht_bounds_free(bounds); - return NULL; -} - -/* Are all members of the band node "node" coincident? - */ -static isl_bool all_coincident(__isl_keep isl_schedule_node *node) -{ - int i, n; - - n = isl_schedule_node_band_n_member(node); - for (i = 0; i < n; ++i) { - isl_bool c; - - c = isl_schedule_node_band_member_get_coincident(node, i); - if (c < 0 || !c) - return c; - } - - return isl_bool_true; -} - -/* Does "node" satisfy the properties of the inner node in the input - * pattern for hybrid tiling? - * That is, is it a band node with only coincident members, of which - * there is at least one? 
- */ -static isl_bool has_child_properties(__isl_keep isl_schedule_node *node) -{ - if (!node) - return isl_bool_error; - if (isl_schedule_node_get_type(node) != isl_schedule_node_band) - return isl_bool_false; - if (isl_schedule_node_band_n_member(node) < 1) - return isl_bool_false; - return all_coincident(node); -} - -/* Does "node" satisfy the properties of the outer node in the input - * pattern for hybrid tiling? - * That is, is it a band node with a single member? - */ -static isl_bool has_parent_properties(__isl_keep isl_schedule_node *node) -{ - if (!node) - return isl_bool_error; - if (isl_schedule_node_get_type(node) != isl_schedule_node_band) - return isl_bool_false; - if (isl_schedule_node_band_n_member(node) != 1) - return isl_bool_false; - return isl_bool_true; -} - -/* Does the parent of "node" satisfy the input pattern for hybrid tiling? - * That is, does "node" satisfy the properties of the inner node and - * does the parent of "node" satisfy the properties of the outer node? - */ -isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node) -{ - isl_bool has_pattern; - - has_pattern = has_child_properties(node); - if (has_pattern < 0 || !has_pattern) - return has_pattern; - - node = isl_schedule_node_copy(node); - node = isl_schedule_node_parent(node); - has_pattern = has_parent_properties(node); - isl_schedule_node_free(node); - - return has_pattern; -} - -/* Does "node" satisfy the input pattern for hybrid tiling? - * That is, does "node" satisfy the properties of the outer node and - * does the child of "node" satisfy the properties of the inner node? - */ -isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node) -{ - isl_bool has_pattern; - - has_pattern = has_parent_properties(node); - if (has_pattern < 0 || !has_pattern) - return has_pattern; - - node = isl_schedule_node_get_child(node, 0); - has_pattern = has_child_properties(node); - isl_schedule_node_free(node); - - return has_pattern; -} - -/* Check that "node" satisfies the input pattern for hybrid tiling. - * Error out if it does not. - */ -static isl_stat check_input_pattern(__isl_keep isl_schedule_node *node) -{ - isl_bool has_pattern; - - has_pattern = ppcg_ht_has_input_pattern(node); - if (has_pattern < 0) - return isl_stat_error; - if (!has_pattern) - isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid, - "invalid input pattern for hybrid tiling", - return isl_stat_error); - - return isl_stat_ok; -} - -/* Extract the input schedule from "node", i.e., the product - * of the partial schedules of the parent and child nodes - * in the input pattern. - */ -static __isl_give isl_multi_union_pw_aff *extract_input_schedule( - __isl_keep isl_schedule_node *node) -{ - isl_multi_union_pw_aff *partial, *partial2; - - partial = isl_schedule_node_band_get_partial_schedule(node); - node = isl_schedule_node_get_child(node, 0); - partial2 = isl_schedule_node_band_get_partial_schedule(node); - isl_schedule_node_free(node); - - return isl_multi_union_pw_aff_range_product(partial, partial2); -} - -/* Collect all dependences from "scop" that are relevant for performing - * hybrid tiling on "node" and its child and map them to the schedule - * space of this pair of nodes. - * - * In case live range reordering is not used, - * the flow and the false dependences are collected. - * In case live range reordering is used, - * the flow and the forced dependences are collected, as well - * as the order dependences that are adjacent to non-local - * flow dependences.
- * - * In all cases, only dependences that map to the same instance - * of the outer part of the schedule are considered. - */ -static __isl_give isl_map *collect_deps(struct ppcg_scop *scop, - __isl_keep isl_schedule_node *node) -{ - isl_space *space; - isl_multi_union_pw_aff *prefix, *partial; - isl_union_map *flow, *other, *dep, *umap; - isl_map *map; - - prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); - partial = extract_input_schedule(node); - space = isl_multi_union_pw_aff_get_space(partial); - - flow = isl_union_map_copy(scop->dep_flow); - flow = isl_union_map_eq_at_multi_union_pw_aff(flow, - isl_multi_union_pw_aff_copy(prefix)); - if (!scop->options->live_range_reordering) { - other = isl_union_map_copy(scop->dep_false); - other = isl_union_map_eq_at_multi_union_pw_aff(other, prefix); - } else { - isl_union_map *local, *non_local, *order, *adj; - isl_union_set *domain, *range; - - other = isl_union_map_copy(scop->dep_forced); - other = isl_union_map_eq_at_multi_union_pw_aff(other, - isl_multi_union_pw_aff_copy(prefix)); - local = isl_union_map_copy(flow); - local = isl_union_map_eq_at_multi_union_pw_aff(local, - isl_multi_union_pw_aff_copy(partial)); - non_local = isl_union_map_copy(flow); - non_local = isl_union_map_subtract(non_local, local); - - order = isl_union_map_copy(scop->dep_order); - order = isl_union_map_eq_at_multi_union_pw_aff(order, prefix); - adj = isl_union_map_copy(order); - domain = isl_union_map_domain(isl_union_map_copy(non_local)); - domain = isl_union_set_coalesce(domain); - adj = isl_union_map_intersect_range(adj, domain); - other = isl_union_map_union(other, adj); - - adj = order; - range = isl_union_map_range(non_local); - range = isl_union_set_coalesce(range); - adj = isl_union_map_intersect_domain(adj, range); - other = isl_union_map_union(other, adj); - } - dep = isl_union_map_union(flow, other); - - umap = isl_union_map_from_multi_union_pw_aff(partial); - dep = isl_union_map_apply_domain(dep, isl_union_map_copy(umap)); - dep = isl_union_map_apply_range(dep, umap); - - space = isl_space_map_from_set(space); - map = isl_union_map_extract_map(dep, space); - isl_union_map_free(dep); - - map = isl_map_coalesce(map); - - return map; -} - -/* Given a constraint of the form - * - * a i_0 + b i_1 >= 0 - * or - * a i_0 + b i_1 = 0 - * - * use it to update one or both of the non-negative bounds - * in "list" = (min, max) such that - * - * i_1 >= -min i_0 - * and - * i_1 <= max i_0 - * - * If b = 0, then the constraint cannot be used. - * Otherwise, the constraint is equivalent to - * - * sgn(b) i_1 >= - a/abs(b) i_0 - * i.e., - * i_1 >= - a/abs(b) i_0 - * or - * i_1 <= a/abs(b) i_0 - * - * Set the first or second element of "list" to max(0, a/abs(b)), - * according to the sign of "b". Or set both in case the constraint - * is an equality, taking into account the sign change. - */ -static __isl_give isl_val_list *list_set_min_max(__isl_take isl_val_list *list, - __isl_keep isl_constraint *c) -{ - isl_val *a, *b; - int sign; - int pos; - isl_bool eq, is_zero, is_neg; - - eq = isl_constraint_is_equality(c); - if (eq < 0) - return isl_val_list_free(list); - - b = isl_constraint_get_coefficient_val(c, isl_dim_set, 1); - is_zero = isl_val_is_zero(b); - if (is_zero == isl_bool_true) { - isl_val_free(b); - return list; - } - a = isl_constraint_get_coefficient_val(c, isl_dim_set, 0); - sign = isl_val_sgn(b); - b = isl_val_abs(b); - a = isl_val_div(a, b); - - if (eq) - b = isl_val_copy(a); - - pos = sign > 0 ? 
0 : 1; - is_neg = isl_val_is_neg(a); - if (is_neg == isl_bool_true) - a = isl_val_set_si(a, 0); - list = isl_val_list_set_val(list, pos, a); - - if (!eq) - return is_neg < 0 ? isl_val_list_free(list) : list; - - pos = 1 - pos; - a = isl_val_neg(b); - is_neg = isl_val_is_neg(a); - if (is_neg == isl_bool_true) - a = isl_val_set_si(a, 0); - list = isl_val_list_set_val(list, pos, a); - - return is_neg < 0 ? isl_val_list_free(list) : list; -} - -/* If constraint "c" passes through the origin, then try and use it - * to update the non-negative bounds in "list" = (min, max) such that - * - * i_1 >= -min i_0 - * and - * i_1 <= max i_0 - */ -static isl_stat set_min_max(__isl_take isl_constraint *c, void *user) -{ - isl_val *v; - isl_val_list **list = user; - isl_bool is_zero; - - v = isl_constraint_get_constant_val(c); - is_zero = isl_val_is_zero(v); - isl_val_free(v); - - if (is_zero == isl_bool_true) - *list = list_set_min_max(*list, c); - - isl_constraint_free(c); - return is_zero < 0 ? isl_stat_error : isl_stat_ok; -} - -/* Given a set of dependence distance vectors "dist", compute - * a pair of non-negative bounds min and max such that - * - * d_pos >= -min d_0 - * and - * d_pos <= max d_0 - * - * and return the pair (min, max). - * If no bound can be found in either direction, then the bound - * is replaced by NaN. - * - * The dependence distances are first projected onto the (d_0, d_pos) space. - * Then the zero dependence distance is added and the convex hull is computed. - * Finally, the bounds are extracted from the constraints of the convex hull - * that pass through the origin. - */ -static __isl_give isl_val_list *min_max_dist(__isl_keep isl_set *dist, int pos) -{ - isl_space *space; - isl_basic_set *hull; - int dim; - isl_ctx *ctx; - isl_val *nan; - isl_val_list *list; - - ctx = isl_set_get_ctx(dist); - nan = isl_val_nan(ctx); - list = isl_val_list_alloc(ctx, 2); - list = isl_val_list_add(list, isl_val_copy(nan)); - list = isl_val_list_add(list, nan); - - dist = isl_set_copy(dist); - dim = isl_set_dim(dist, isl_dim_set); - if (dist && pos >= dim) - isl_die(ctx, isl_error_internal, "position out of bounds", - dist = isl_set_free(dist)); - dist = isl_set_project_out(dist, isl_dim_set, pos + 1, dim - (pos + 1)); - dist = isl_set_project_out(dist, isl_dim_set, 1, pos - 1); - - space = isl_set_get_space(dist); - dist = isl_set_union(dist, isl_set_from_point(isl_point_zero(space))); - dist = isl_set_remove_divs(dist); - hull = isl_set_convex_hull(dist); - - if (isl_basic_set_foreach_constraint(hull, &set_min_max, &list) < 0) - list = isl_val_list_free(list); - isl_basic_set_free(hull); - - return list; -} - -/* Given a schedule node "node" that, together with its child, - * satisfies the input pattern for hybrid tiling, compute bounds - * on the relative dependence distances of the child node with - * respect to the parent node. These bounds are needed to - * construct a hybrid tiling. - * - * First all relevant dependences are collected and mapped - * to the schedule space of the pair of nodes. Then, the - * dependence distances are computed in this space. - * - * These dependence distances are then projected onto a two-dimensional - * space consisting of the single schedule dimension of the outer node - * and one of the schedule dimensions of the inner node. - * The maximal and minimal relative dependence distances are extracted - * from these projections. - * This process is repeated for each of the schedule dimensions - * of the inner node.
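As a worked example (hypothetical distances, not part of this patch): for dist = { [1, -2]; [1, 1]; [2, 1] } and pos = 1, the convex hull of these points together with the origin has the facets 2 d_0 + d_1 >= 0 and d_0 - d_1 >= 0 through the origin, so min_max_dist returns the pair (min, max) = (2, 1), i.e. d_1 >= -2 d_0 and d_1 <= 1 d_0.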
For the first dimension, both minimal and - * maximal relative dependence distances are stored in the result. - * For the other dimensions, only the minimal relative dependence - * distance is stored. - */ -__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop, - __isl_keep isl_schedule_node *node) -{ - ppcg_ht_bounds *bnd; - isl_space *space; - isl_map *map; - isl_set *dist; - isl_val_list *pair; - isl_schedule_node *child; - int n; - int i, dim; - - if (!scop || !node || check_input_pattern(node) < 0) - return NULL; - - child = isl_schedule_node_get_child(node, 0); - space = isl_schedule_node_band_get_space(child); - dim = isl_schedule_node_band_n_member(child); - isl_schedule_node_free(child); - bnd = ppcg_ht_bounds_alloc(space); - if (!bnd) - return NULL; - - map = collect_deps(scop, node); - - dist = isl_map_deltas(map); - n = isl_set_dim(dist, isl_dim_param); - dist = isl_set_project_out(dist, isl_dim_param, 0, n); - - pair = min_max_dist(dist, 1); - bnd = ppcg_ht_bounds_set_lower(bnd, 0, isl_val_list_get_val(pair, 0)); - bnd = ppcg_ht_bounds_set_upper(bnd, isl_val_list_get_val(pair, 1)); - isl_val_list_free(pair); - - for (i = 1; i < dim; ++i) { - pair = min_max_dist(dist, 1 + i); - bnd = ppcg_ht_bounds_set_lower(bnd, i, - isl_val_list_get_val(pair, 0)); - isl_val_list_free(pair); - } - - isl_set_free(dist); - - return bnd; -} - -/* Check if all the fields of "phase" are valid, freeing "phase" - * if they are not. - */ -static __isl_give ppcg_ht_phase *check_phase(__isl_take ppcg_ht_phase *phase) -{ - if (!phase) - return NULL; - - if (!phase->tiling || !phase->local_time || - !phase->shift_space || !phase->domain) - return ppcg_ht_phase_free(phase); - - return phase; -} - -/* Construct a ppcg_ht_phase object, that simply copies - * information from "tiling". - * That is, the result is defined over the "ts" space and - * corresponds to phase 1. - */ -static __isl_give ppcg_ht_phase *construct_phase( - __isl_keep ppcg_ht_tiling *tiling) -{ - isl_ctx *ctx; - ppcg_ht_phase *phase; - - if (!tiling) - return NULL; - - ctx = ppcg_ht_tiling_get_ctx(tiling); - phase = isl_calloc_type(ctx, struct ppcg_ht_phase); - if (!phase) - return NULL; - phase->tiling = ppcg_ht_tiling_copy(tiling); - phase->time_tile = isl_aff_copy(tiling->time_tile); - phase->local_time = isl_aff_copy(tiling->local_time); - phase->shift_space = isl_aff_copy(tiling->shift_space); - phase->domain = isl_set_copy(tiling->hex); - - return check_phase(phase); -} - -/* Align the parameters of the elements of "phase" to those of "space". - */ -static __isl_give ppcg_ht_phase *phase_align_params( - __isl_take ppcg_ht_phase *phase, __isl_take isl_space *space) -{ - if (!phase) - goto error; - - phase->time_tile = isl_aff_align_params(phase->time_tile, - isl_space_copy(space)); - phase->local_time = isl_aff_align_params(phase->local_time, - isl_space_copy(space)); - phase->shift_space = isl_aff_align_params(phase->shift_space, - isl_space_copy(space)); - phase->domain = isl_set_align_params(phase->domain, space); - - return check_phase(phase); -error: - isl_space_free(space); - return NULL; -} - -/* Pull back "phase" over "ma". - * That is, take a phase defined over the range of "ma" and - * turn it into a phase defined over the domain of "ma". 
- */ -static __isl_give ppcg_ht_phase *pullback_phase(__isl_take ppcg_ht_phase *phase, - __isl_take isl_multi_aff *ma) -{ - phase = phase_align_params(phase, isl_multi_aff_get_space(ma)); - if (!phase) - goto error; - - phase->time_tile = isl_aff_pullback_multi_aff(phase->time_tile, - isl_multi_aff_copy(ma)); - phase->local_time = isl_aff_pullback_multi_aff(phase->local_time, - isl_multi_aff_copy(ma)); - phase->shift_space = isl_aff_pullback_multi_aff(phase->shift_space, - isl_multi_aff_copy(ma)); - phase->domain = isl_set_preimage_multi_aff(phase->domain, ma); - - return check_phase(phase); -error: - isl_multi_aff_free(ma); - return NULL; -} - -/* Pullback "phase" over phase->tiling->shift_phase, which shifts - * phase 0 to phase 1. The pullback therefore takes a phase 1 - * description and turns it into a phase 0 description. - */ -static __isl_give ppcg_ht_phase *shift_phase(__isl_take ppcg_ht_phase *phase) -{ - ppcg_ht_tiling *tiling; - - if (!phase) - return NULL; - - tiling = phase->tiling; - return pullback_phase(phase, isl_multi_aff_copy(tiling->shift_phase)); -} - -/* Take a "phase" defined over the ts-space and plug in the projection - * from the input schedule space to the ts-space. - * The result is then defined over this input schedule space. - */ -static __isl_give ppcg_ht_phase *lift_phase(__isl_take ppcg_ht_phase *phase) -{ - ppcg_ht_tiling *tiling; - - if (!phase) - return NULL; - - tiling = phase->tiling; - return pullback_phase(phase, isl_multi_aff_copy(tiling->project_ts)); -} - -/* Compute the shift that should be added to the space band - * in order to be able to apply rectangular tiling to the space. - * Store the shift in phase->space_shift. - * - * In the first dimension, it is equal to shift_space - s. - * For phase 1, this results in - * - * (-(2 * shift_s)*T) % W - * - * In phase 0, the "s" in shift_space has been replaced by "s + shift_s", - * so the result is - * - * shift_s + (-(2 * shift_s)*T) % W - * - * In the other dimensions, the shift is equal to - * - * dl_i * local_time. - */ -static __isl_give ppcg_ht_phase *compute_space_shift( - __isl_take ppcg_ht_phase *phase) -{ - int i, n; - isl_space *space; - isl_local_space *ls; - isl_aff *aff, *s; - isl_multi_aff *space_shift; - - if (!phase) - return NULL; - - space = ppcg_ht_phase_get_input_space(phase); - space = isl_space_unwrap(space); - space = isl_space_range_map(space); - - space_shift = isl_multi_aff_zero(space); - aff = isl_aff_copy(phase->shift_space); - ls = isl_local_space_from_space(isl_aff_get_domain_space(aff)); - s = isl_aff_var_on_domain(ls, isl_dim_set, 1); - aff = isl_aff_sub(aff, s); - space_shift = isl_multi_aff_set_aff(space_shift, 0, aff); - - n = isl_multi_aff_dim(space_shift, isl_dim_out); - for (i = 1; i < n; ++i) { - isl_val *v; - isl_aff *time; - - v = ppcg_ht_bounds_get_lower(phase->tiling->bounds, i); - time = isl_aff_copy(phase->local_time); - time = isl_aff_scale_val(time, v); - space_shift = isl_multi_aff_set_aff(space_shift, i, time); - } - - if (!space_shift) - return ppcg_ht_phase_free(phase); - phase->space_shift = space_shift; - return phase; -} - -/* Compute the space tiling and store the result in phase->space_tile. 
- * The space tiling is of the form - * - * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)] - */ -static __isl_give ppcg_ht_phase *compute_space_tile( - __isl_take ppcg_ht_phase *phase) -{ - isl_space *space; - isl_multi_val *space_sizes; - isl_multi_aff *space_shift; - isl_multi_aff *tile; - - if (!phase) - return NULL; - - space = ppcg_ht_phase_get_input_space(phase); - space = isl_space_unwrap(space); - tile = isl_multi_aff_range_map(space); - space_shift = isl_multi_aff_copy(phase->space_shift); - tile = isl_multi_aff_add(space_shift, tile); - space_sizes = isl_multi_val_copy(phase->tiling->space_sizes); - tile = isl_multi_aff_scale_down_multi_val(tile, space_sizes); - tile = isl_multi_aff_floor(tile); - - if (!tile) - return ppcg_ht_phase_free(phase); - phase->space_tile = tile; - return phase; -} - -/* Construct a representation for one of the two phases of hybrid tiling - * "tiling". If "shift" is not set, then the phase is constructed - * directly from the hexagonal tile shape in "tiling", which represents - * the phase-1 tiles. If "shift" is set, then this tile shape is shifted - * back over tiling->shift_phase to obtain the phase-0 tiles. - * - * First copy data from "tiling", then optionally shift the phase and - * finally move the tiling from the "ts" space of "tiling" to - * the space of the input pattern. - * - * After the basic phase has been computed, also compute - * the corresponding space shift. - */ -static __isl_give ppcg_ht_phase *ppcg_ht_tiling_compute_phase( - __isl_keep ppcg_ht_tiling *tiling, int shift) -{ - ppcg_ht_phase *phase; - - phase = construct_phase(tiling); - if (shift) - phase = shift_phase(phase); - phase = lift_phase(phase); - - phase = compute_space_shift(phase); - phase = compute_space_tile(phase); - - return phase; -} - -/* Construct a function that is equal to the time tile of "phase0" - * on the domain of "phase0" and equal to the time tile of "phase1" - * on the domain of "phase1". - * The two domains are assumed to form a partition of the input - * schedule space. - */ -static __isl_give isl_pw_multi_aff *combine_time_tile( - __isl_keep ppcg_ht_phase *phase0, __isl_keep ppcg_ht_phase *phase1) -{ - isl_aff *T; - isl_pw_aff *time, *time1; - - if (!phase0 || !phase1) - return NULL; - - T = isl_aff_copy(phase0->time_tile); - time = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase0), T); - - T = isl_aff_copy(phase1->time_tile); - time1 = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase1), T); - - time = isl_pw_aff_union_add(time, time1); - - return isl_pw_multi_aff_from_pw_aff(time); -} - -/* Name used in mark nodes that contain a pointer to a ppcg_ht_phase. - */ -static char *ppcg_phase_name = "phase"; - -/* Does "id" contain a pointer to a ppcg_ht_phase? - * That is, is it called "phase"? - */ -static isl_bool is_phase_id(__isl_keep isl_id *id) -{ - const char *name; - - name = isl_id_get_name(id); - if (!name) - return isl_bool_error; - - return !strcmp(name, ppcg_phase_name); -} - -/* Given a mark node with an identifier that points to a ppcg_ht_phase, - * extract this ppcg_ht_phase pointer.
- */ -__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark( - __isl_keep isl_schedule_node *node) -{ - isl_bool is_phase; - isl_id *id; - void *p; - - if (!node) - return NULL; - if (isl_schedule_node_get_type(node) != isl_schedule_node_mark) - isl_die(isl_schedule_node_get_ctx(node), isl_error_internal, - "not a phase mark", return NULL); - - id = isl_schedule_node_mark_get_id(node); - is_phase = is_phase_id(id); - p = isl_id_get_user(id); - isl_id_free(id); - - if (is_phase < 0) - return NULL; - if (!is_phase) - isl_die(isl_schedule_node_get_ctx(node), isl_error_internal, - "not a phase mark", return NULL); - - return p; -} - -/* Insert a mark node at "node" holding a pointer to "phase". - */ -static __isl_give isl_schedule_node *insert_phase( - __isl_take isl_schedule_node *node, __isl_take ppcg_ht_phase *phase) -{ - isl_ctx *ctx; - isl_id *id; - - if (!node) - goto error; - ctx = isl_schedule_node_get_ctx(node); - id = isl_id_alloc(ctx, ppcg_phase_name, phase); - if (!id) - goto error; - id = isl_id_set_free_user(id, &ppcg_ht_phase_free_wrap); - node = isl_schedule_node_insert_mark(node, id); - - return node; -error: - ppcg_ht_phase_free(phase); - isl_schedule_node_free(node); - return NULL; -} - -/* Construct a mapping from the elements of the original pair of bands - * to which tiling was applied that belong to a tile of "phase" - * to that tile, preserving the values for the outer bands. - * - * The mapping is of the form - * - * [[outer] -> [P -> C]] -> [[outer] -> [tile]] - * - * where tile is defined by a concatenation of the time_tile and - * the space_tile. - */ -static __isl_give isl_map *construct_tile_map(__isl_keep ppcg_ht_phase *phase) -{ - int depth; - isl_space *space; - isl_multi_aff *ma; - isl_multi_aff *tiling; - isl_map *el2tile; - - depth = isl_schedule_node_get_schedule_depth( - phase->tiling->input_node); - space = isl_aff_get_space(phase->time_tile); - space = isl_space_params(space); - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, depth); - space = isl_space_map_from_set(space); - ma = isl_multi_aff_identity(space); - - tiling = isl_multi_aff_flat_range_product( - isl_multi_aff_from_aff(isl_aff_copy(phase->time_tile)), - isl_multi_aff_copy(phase->space_tile)); - el2tile = isl_map_from_multi_aff(tiling); - el2tile = isl_map_intersect_domain(el2tile, - isl_set_copy(phase->domain)); - el2tile = isl_map_product(isl_map_from_multi_aff(ma), el2tile); - - return el2tile; -} - -/* Return a description of the full tiles of "phase" at the point - * in the original schedule tree where the tiling was applied. - * - * First construct a mapping from the input schedule dimensions - * up to and including the original pair of bands to which hybrid tiling - * was applied to schedule dimensions in which this original pair - * has been replaced by the tiles. - * This mapping is of the form - * - * [[outer] -> [P -> C]] -> [[outer] -> [tile]] - * - * Apply this mapping to the set of all values for the input - * schedule dimensions and then apply its inverse. - * The result is the set of values for the input schedule dimensions - * that would map to any of the tiles. Subtracting from this set - * the set of values that are actually executed produces the set - * of values that belong to a tile but that are not executed. - * Mapping these back to the tiles produces a description of - * the partial tiles.
Subtracting these from the set of all tiles - * produces a description of the full tiles in the form - * - * [[outer] -> [tile]] - */ -static __isl_give isl_set *compute_full_tile(__isl_keep ppcg_ht_phase *phase) -{ - isl_schedule_node *node; - isl_union_set *domain; - isl_union_map *prefix, *schedule; - isl_set *all, *partial, *all_el; - isl_map *tile2el, *el2tile; - isl_multi_union_pw_aff *mupa; - - el2tile = construct_tile_map(phase); - tile2el = isl_map_reverse(isl_map_copy(el2tile)); - - node = phase->tiling->input_node; - prefix = isl_schedule_node_get_prefix_schedule_union_map(node); - domain = isl_schedule_node_get_domain(node); - mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); - schedule = isl_union_map_from_multi_union_pw_aff(mupa); - schedule = isl_union_map_range_product(prefix, schedule); - all_el = isl_set_from_union_set(isl_union_set_apply(domain, schedule)); - all_el = isl_set_coalesce(all_el); - - all = isl_set_apply(isl_set_copy(all_el), isl_map_copy(el2tile)); - - partial = isl_set_copy(all); - partial = isl_set_apply(partial, tile2el); - partial = isl_set_subtract(partial, all_el); - partial = isl_set_apply(partial, el2tile); - - return isl_set_subtract(all, partial); -} - -/* Copy the AST loop types of the non-isolated part to those - * of the isolated part. - */ -static __isl_give isl_schedule_node *set_isolate_loop_type( - __isl_take isl_schedule_node *node) -{ - int i, n; - - n = isl_schedule_node_band_n_member(node); - for (i = 0; i < n; ++i) { - enum isl_ast_loop_type type; - - type = isl_schedule_node_band_member_get_ast_loop_type(node, i); - node = isl_schedule_node_band_member_set_isolate_ast_loop_type( - node, i, type); - } - - return node; -} - -/* If options->isolate_full_tiles is set, then mark the full tiles - * in "node" for isolation. The full tiles are derived from "phase". - * "node" may point to a part of the tiling, e.g., the space tiling. - * - * The full tiles are originally computed in the form - * - * [[outer] -> [tile]] - * - * However, the band that "node" points to may only contain - * a subset of the tile dimensions. - * The description above is therefore treated as - * - * [[outer] -> [before; this; after]] - * - * before is of size "pos"; this is of size "dim"; and - * after is of size "out - pos - dim". - * The after part is first projected out. Then the range is split - * into a before and this part and finally the before part is moved - * to the domain, resulting in - * - * [[outer; before] -> [this]] - * - * This description is then used as the isolate option. - * - * The AST loop type for the isolated part is set to be the same - * as that of the non-isolated part.
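For illustration (hypothetical values, not part of this patch): the resulting AST build option is a wrapped set named "isolate" over outer and current band dimensions, e.g. { isolate[[T] -> [c]] : 1 <= T <= 8 }, which asks the AST generator to generate code for those iterations separately from the boundary tiles.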
- */ -static __isl_give isl_schedule_node *ppcg_ht_phase_isolate_full_tile_node( - __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node, - struct ppcg_options *options) -{ - int in, out, pos, depth, dim; - isl_space *space; - isl_multi_aff *ma1, *ma2; - isl_set *tile; - isl_map *map; - isl_set *set; - isl_union_set *opt; - - if (!options->isolate_full_tiles) - return node; - - depth = isl_schedule_node_get_schedule_depth(node); - dim = isl_schedule_node_band_n_member(node); - - tile = compute_full_tile(phase); - map = isl_set_unwrap(tile); - in = isl_map_dim(map, isl_dim_in); - out = isl_map_dim(map, isl_dim_out); - pos = depth - in; - map = isl_map_project_out(map, isl_dim_out, pos + dim, - out - (pos + dim)); - space = isl_space_range(isl_map_get_space(map)); - ma1 = isl_multi_aff_project_out_map(isl_space_copy(space), - isl_dim_set, pos, dim); - ma2 = isl_multi_aff_project_out_map(space, isl_dim_set, 0, pos); - ma1 = isl_multi_aff_range_product(ma1, ma2); - map = isl_map_apply_range(map, isl_map_from_multi_aff(ma1)); - map = isl_map_uncurry(map); - map = isl_map_flatten_domain(map); - set = isl_map_wrap(map); - set = isl_set_set_tuple_name(set, "isolate"); - - opt = isl_schedule_node_band_get_ast_build_options(node); - opt = isl_union_set_add_set(opt, set); - node = isl_schedule_node_band_set_ast_build_options(node, opt); - node = set_isolate_loop_type(node); - - return node; -} - -/* Insert a band node for performing the space tiling for "phase" at "node". - * In particular, insert a band node with partial schedule - * - * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)] - * - * pulled back over the input schedule. - * "options" determines whether full tiles should be separated - * from partial tiles. - * - * The first tile dimension iterates over the hexagons in the same - * phase, which are independent by construction. The first dimension - * is therefore marked coincident. - * All dimensions are also marked for being generated as atomic loops - * because separation is usually not desirable on tile loops. - */ -static __isl_give isl_schedule_node *insert_space_tiling( - __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node, - struct ppcg_options *options) -{ - isl_multi_aff *space_tile; - isl_multi_union_pw_aff *mupa; - - if (!phase) - return isl_schedule_node_free(node); - - space_tile = isl_multi_aff_copy(phase->space_tile); - mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); - mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_tile); - node = isl_schedule_node_insert_partial_schedule(node, mupa); - node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); - node = ppcg_ht_phase_isolate_full_tile_node(phase, node, options); - node = isl_schedule_node_band_member_set_coincident(node, 0, 1); - - return node; -} - -/* Given a pointer "node" to (a copy of) the original child node - * in the input pattern, adjust its partial schedule such that - * it starts at zero within each tile. - * - * That is, replace "s" by (s + space_shift) % space_sizes. 
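As a numeric illustration (hypothetical values, not part of this patch): with W = 10 and 2 * shift_s = 8, successive time tiles T = 0, 1, 2, ... receive space shifts (-8 * T) % 10 = 0, 2, 4, ..., and a point with s = 23 in a tile with shift 4 ends up at local coordinate (23 + 4) % 10 = 7.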
- */
-__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point(
-	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node)
-{
-	isl_multi_val *space_sizes;
-	isl_multi_aff *space_shift;
-	isl_multi_union_pw_aff *mupa;
-
-	space_shift = isl_multi_aff_copy(phase->space_shift);
-	mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule);
-	mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_shift);
-	node = isl_schedule_node_band_shift(node, mupa);
-	space_sizes = isl_multi_val_copy(phase->tiling->space_sizes);
-	node = isl_schedule_node_band_mod(node, space_sizes);
-
-	return node;
-}
-
-/* Does
- *
- *	s0 > delta + 2 * {delta * h} - 1
- *
- * hold?
- */
-static isl_bool wide_enough(__isl_keep isl_val *s0, __isl_keep isl_val *delta,
-	__isl_keep isl_val *h)
-{
-	isl_val *v, *v2;
-	isl_bool ok;
-
-	v = isl_val_mul(isl_val_copy(delta), isl_val_copy(h));
-	v2 = isl_val_floor(isl_val_copy(v));
-	v = isl_val_sub(v, v2);
-	v = isl_val_mul_ui(v, 2);
-	v = isl_val_add(v, isl_val_copy(delta));
-	v = isl_val_sub_ui(v, 1);
-	ok = isl_val_gt(s0, v);
-	isl_val_free(v);
-
-	return ok;
-}
-
-/* Is the tile size specified by "sizes" wide enough in the first space
- * dimension, i.e., the base of the hexagon? This ensures that,
- * after hybrid tiling using "bounds" and these sizes,
- * neighboring hexagons in the same phase are far enough apart
- * that they do not depend on each other.
- * The test is only meaningful if the bounds are valid.
- *
- * Let st be (half) the size in the time dimension and s0 the base
- * size in the first space dimension. Let delta be the dependence
- * distance in either positive or negative direction. In principle,
- * it should be enough to have s0 + 1 > delta, i.e., s0 >= delta.
- * However, in case of fractional delta, the tile is not extended
- * with delta * (st - 1), but instead with floor(delta * (st - 1)).
- * The condition therefore needs to be adjusted to
- *
- *	s0 + 1 > delta + 2 {delta * (st - 1)}
- *
- * (with {} the fractional part) to account for the two slanted sides.
- * The condition in the paper "Hybrid Hexagonal/Classical Tiling for GPUs"
- * translates to
- *
- *	s0 >= delta + {delta * (st - 1)}
- *
- * Since 1 > frac(delta * (st - 1)), this condition implies
- * the condition above.
- *
- * The condition is checked for both directions.
- */
-isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds,
-	__isl_keep isl_multi_val *sizes)
-{
-	isl_val *s0, *h;
-	isl_val *delta;
-	isl_bool ok;
-
-	ok = ppcg_ht_bounds_is_valid(bounds);
-	if (ok < 0 || !ok)
-		return ok;
-
-	h = isl_val_sub_ui(isl_multi_val_get_val(sizes, 0), 1);
-	s0 = isl_multi_val_get_val(sizes, 1);
-
-	delta = ppcg_ht_bounds_get_lower(bounds, 0);
-	ok = wide_enough(s0, delta, h);
-	isl_val_free(delta);
-
-	delta = ppcg_ht_bounds_get_upper(bounds);
-	if (ok == isl_bool_true)
-		ok = wide_enough(s0, delta, h);
-	isl_val_free(delta);
-
-	isl_val_free(s0);
-	isl_val_free(h);
-
-	return ok;
-}
-
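As a concrete instance of the test implemented by wide_enough() (an illustrative computation): take st = 3, so h = st - 1 = 2, and a dependence distance delta = 1/3. Then delta * h = 2/3 with fractional part 2/3, and the check requires s0 > 1/3 + 2 * (2/3) - 1 = 2/3, so any integral base size s0 >= 1 is wide enough. The stronger condition from the paper, s0 >= delta + {delta * h} = 1/3 + 2/3 = 1, yields the same bound in this case.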
-/* Check that the tile will be wide enough in the first space
- * dimension, i.e., the base of the hexagon. This ensures that
- * neighboring hexagons in the same phase are far enough apart
- * that they do not depend on each other.
- *
- * Error out if the condition fails to hold.
- */
-static isl_stat check_width(__isl_keep ppcg_ht_bounds *bounds,
-	__isl_keep isl_multi_val *sizes)
-{
-	isl_bool ok;
-
-	ok = ppcg_ht_bounds_supports_sizes(bounds, sizes);
-
-	if (ok < 0)
-		return isl_stat_error;
-	if (!ok)
-		isl_die(isl_multi_val_get_ctx(sizes), isl_error_invalid,
-			"base of hybrid tiling hexagon not sufficiently wide",
-			return isl_stat_error);
-
-	return isl_stat_ok;
-}
-
-/* Given valid bounds on the relative dependence distances for
- * the pair of nested nodes that "node" points to, as well as sufficiently
- * wide tile sizes "sizes", insert the corresponding time and space tiling
- * at "node", along with a pair of phase nodes that can be used
- * to make further changes.
- * The space of "sizes" should be the product of the spaces
- * of the schedules of the pair of parent and child nodes.
- * "options" determines whether full tiles should be separated
- * from partial tiles.
- *
- * In particular, given an input of the form
- *
- *	P - C - ...
- *
- * the output has the form
- *
- *	     /- F0 - M0 - CT0 - P - C - ...
- *	PT - seq
- *	     \- F1 - M1 - CT1 - P - C - ...
- *
- * PT is the global time tiling. Within each of these tiles,
- * two phases are executed in order. Within each phase, the schedule
- * space is further subdivided into tiles through CT0 and CT1.
- * The first dimension of each of these iterates over the hexagons
- * within a phase and these are independent by construction.
- * The F0 and F1 filters filter the statement instances that belong
- * to the corresponding phase. The M0 and M1 marks contain a pointer
- * to a ppcg_ht_phase object that can be used to perform further changes.
- *
- * After checking that the input satisfies the requirements,
- * a data structure is constructed that represents the tiling and
- * two additional data structures are constructed for the two phases
- * of the tiling. These are then used to define the filters F0 and F1 and
- * combined to construct the time tiling PT.
- * Then the time tiling node PT is inserted, followed by
- * the sequence with the two filters, the CT space tiling nodes and
- * the phase markers M.
- */
-__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling(
-	__isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes,
-	__isl_take isl_schedule_node *node, struct ppcg_options *options)
-{
-	isl_ctx *ctx;
-	isl_union_set *phase0;
-	isl_union_set *phase1;
-	isl_multi_union_pw_aff *input, *dom_time;
-	isl_union_pw_multi_aff *upma;
-	isl_pw_multi_aff *time;
-	isl_union_set_list *phases;
-	ppcg_ht_tiling *tiling;
-	ppcg_ht_phase *phase_0;
-	ppcg_ht_phase *phase_1;
-
-	if (!node || !sizes || !bounds)
-		goto error;
-	if (check_input_pattern(node) < 0 || check_width(bounds, sizes) < 0)
-		goto error;
-
-	ctx = isl_schedule_node_get_ctx(node);
-
-	input = extract_input_schedule(node);
-
-	tiling = ppcg_ht_bounds_construct_tiling(bounds, node, input, sizes);
-	phase_0 = ppcg_ht_tiling_compute_phase(tiling, 1);
-	phase_1 = ppcg_ht_tiling_compute_phase(tiling, 0);
-	time = combine_time_tile(phase_0, phase_1);
-	ppcg_ht_tiling_free(tiling);
-
-	upma = isl_union_pw_multi_aff_from_multi_union_pw_aff(
-		isl_multi_union_pw_aff_copy(input));
-	phase0 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_0));
-	phase0 = isl_union_set_preimage_union_pw_multi_aff(phase0,
-		isl_union_pw_multi_aff_copy(upma));
-	phase1 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_1));
-	phase1 = isl_union_set_preimage_union_pw_multi_aff(phase1, upma);
-
-	phases = isl_union_set_list_alloc(ctx, 2);
-	phases = isl_union_set_list_add(phases, phase0);
-	phases = isl_union_set_list_add(phases, phase1);
-
-	dom_time = isl_multi_union_pw_aff_apply_pw_multi_aff(input, time);
-	node = isl_schedule_node_insert_partial_schedule(node, dom_time);
-
-	node = isl_schedule_node_child(node, 0);
-
-	node = isl_schedule_node_insert_sequence(node, phases);
-	node = isl_schedule_node_child(node, 0);
-	node = isl_schedule_node_child(node, 0);
-	node = insert_space_tiling(phase_0, node, options);
-	node = insert_phase(node, phase_0);
-	node = isl_schedule_node_parent(node);
-	node = isl_schedule_node_next_sibling(node);
-	node = isl_schedule_node_child(node, 0);
-	node = insert_space_tiling(phase_1, node, options);
-	node = insert_phase(node, phase_1);
-	node = isl_schedule_node_parent(node);
-	node = isl_schedule_node_parent(node);
-
-	node = isl_schedule_node_parent(node);
-
-	isl_multi_val_free(sizes);
-	return node;
-error:
-	isl_multi_val_free(sizes);
-	isl_schedule_node_free(node);
-	ppcg_ht_bounds_free(bounds);
-	return NULL;
-}
-
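Taken together, a caller would typically insert the tiling, optionally rewrite each phase, and then discard the marks. The sketch below is hypothetical (apply_hybrid_tiling is not part of this code) and assumes "bounds" and "sizes" were prepared elsewhere:

/* Hypothetical driver around the entry points above. */
static __isl_give isl_schedule_node *apply_hybrid_tiling(
	__isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes,
	__isl_take isl_schedule_node *node, struct ppcg_options *options)
{
	/* Insert PT, the phase filters F0/F1, the marks M0/M1 and
	 * the space tilings CT0/CT1. */
	node = ppcg_ht_bounds_insert_tiling(bounds, sizes, node, options);
	/* Per-phase rewriting could be applied here through
	 * hybrid_tile_foreach_phase(). */
	/* Drop M0 and M1 once no further per-phase changes are needed. */
	node = hybrid_tile_drop_phase_marks(node);
	return node;
}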
-/* Given a branch "node" that contains a sequence node with two phases
- * of hybrid tiling as input, call "fn" on each of the two phase marker
- * nodes.
- *
- * That is, the input is as follows
- *
- *	      /- F0 - M0 - ...
- *	... - seq
- *	      \- F1 - M1 - ...
- *
- * and "fn" is called on M0 and on M1.
- */
-__isl_give isl_schedule_node *hybrid_tile_foreach_phase(
-	__isl_take isl_schedule_node *node,
-	__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
-		void *user), void *user)
-{
-	int depth0, depth;
-
-	depth0 = isl_schedule_node_get_tree_depth(node);
-
-	while (node &&
-	    isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
-		node = isl_schedule_node_child(node, 0);
-
-	node = isl_schedule_node_child(node, 0);
-	node = isl_schedule_node_child(node, 0);
-	if (!node)
-		return NULL;
-	node = fn(node, user);
-	node = isl_schedule_node_parent(node);
-	node = isl_schedule_node_next_sibling(node);
-	node = isl_schedule_node_child(node, 0);
-	if (!node)
-		return NULL;
-	node = fn(node, user);
-	node = isl_schedule_node_parent(node);
-	node = isl_schedule_node_parent(node);
-
-	depth = isl_schedule_node_get_tree_depth(node);
-	node = isl_schedule_node_ancestor(node, depth - depth0);
-
-	return node;
-}
-
-/* This function is called on each of the two phase marks
- * in a hybrid tiling tree.
- * Drop the phase mark at "node".
- */
-static __isl_give isl_schedule_node *drop_phase_mark(
-	__isl_take isl_schedule_node *node, void *user)
-{
-	isl_id *id;
-	isl_bool is_phase;
-
-	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
-		return node;
-
-	id = isl_schedule_node_mark_get_id(node);
-	is_phase = is_phase_id(id);
-	isl_id_free(id);
-
-	if (is_phase < 0)
-		return isl_schedule_node_free(node);
-	if (is_phase)
-		node = isl_schedule_node_delete(node);
-
-	return node;
-}
-
-/* Given a branch "node" that contains a sequence node with two phases
- * of hybrid tiling as input, remove the two phase marker nodes.
- *
- * That is, the input is as follows
- *
- *	      /- F0 - M0 - ...
- *	... - seq
- *	      \- F1 - M1 - ...
- *
- * and the output is
- *
- *	      /- F0 - ...
- *	... - seq
- *	      \- F1 - ...
- */
-__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks(
-	__isl_take isl_schedule_node *node)
-{
-	return hybrid_tile_foreach_phase(node, &drop_phase_mark, NULL);
-}
diff --git a/polly/lib/External/ppcg/ocl_utilities.h b/polly/lib/External/ppcg/ocl_utilities.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/ocl_utilities.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef OCL_UTILITIES_H
-#define OCL_UTILITIES_H
-
-#if defined(__APPLE__)
-#include <OpenCL/opencl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-/* Return the OpenCL error string for a given error number.
- */
-const char *opencl_error_string(cl_int error);
-
-/* Find a GPU or a CPU associated with the first available platform.
- * If use_gpu is set, then this function first tries to look for a GPU
- * in the first available platform.
- * If this fails or if use_gpu is not set, then it tries to use the CPU.
- */
-cl_device_id opencl_create_device(int use_gpu);
-
-/* Create an OpenCL program from a string and compile it.
- */
-cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
-	const char *program_source, size_t program_size,
-	const char *opencl_options);
-
-/* Create an OpenCL program from a source file and compile it.
- */
-cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
-	const char* filename, const char* opencl_options);
-
-#endif
diff --git a/polly/lib/External/ppcg/ocl_utilities.c b/polly/lib/External/ppcg/ocl_utilities.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/ocl_utilities.c
+++ /dev/null
@@ -1,174 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "ocl_utilities.h"
-
-/* Return the OpenCL error string for a given error number.
- */
-const char *opencl_error_string(cl_int error)
-{
-	int errorCount;
-	int index;
-
-	static const char *errorString[] = {
-		[CL_SUCCESS] = "CL_SUCCESS",
-		[-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND",
-		[-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE",
-		[-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE",
-		[-CL_MEM_OBJECT_ALLOCATION_FAILURE] =
-			"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		[-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES",
-		[-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY",
-		[-CL_PROFILING_INFO_NOT_AVAILABLE] =
-			"CL_PROFILING_INFO_NOT_AVAILABLE",
-		[-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP",
-		[-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH",
-		[-CL_IMAGE_FORMAT_NOT_SUPPORTED] =
-			"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		[-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE",
-		[-CL_MAP_FAILURE] = "CL_MAP_FAILURE",
-		[-CL_INVALID_VALUE] = "CL_INVALID_VALUE",
-		[-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE",
-		[-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM",
-		[-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE",
-		[-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT",
-		[-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES",
-		[-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE",
-		[-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR",
-		[-CL_INVALID_MEM_OBJECT] = "CL_INVALID_MEM_OBJECT",
-		[-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] =
-			"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		[-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE",
-		[-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER",
-		[-CL_INVALID_BINARY] = "CL_INVALID_BINARY",
-		[-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS",
-		[-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM",
-		[-CL_INVALID_PROGRAM_EXECUTABLE] =
-			"CL_INVALID_PROGRAM_EXECUTABLE",
-		[-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME",
-		[-CL_INVALID_KERNEL_DEFINITION] =
-			"CL_INVALID_KERNEL_DEFINITION",
-		[-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL",
-		[-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX",
-		[-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE",
-		[-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE",
-		[-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS",
-		[-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION",
-		[-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE",
-		[-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE",
-		[-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET",
-		[-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST",
-		[-CL_INVALID_EVENT] = "CL_INVALID_EVENT",
-		[-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION",
-		[-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT",
-		[-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE",
-		[-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL",
-		[-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE",
-		[-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY"
-	};
-
-	errorCount = sizeof(errorString) / sizeof(errorString[0]);
-	index = -error;
-
-	return (index >= 0 && index < errorCount) ?
-		errorString[index] : "Unspecified Error";
-}
-
-/* Find a GPU or a CPU associated with the first available platform.
- * If use_gpu is set, then this function first tries to look for a GPU
- * in the first available platform.
- * If this fails or if use_gpu is not set, then it tries to use the CPU.
- */
-cl_device_id opencl_create_device(int use_gpu)
-{
-	cl_platform_id platform;
-	cl_device_id dev;
-	int err;
-
-	err = clGetPlatformIDs(1, &platform, NULL);
-	if (err < 0) {
-		fprintf(stderr, "Error %s while looking for a platform.\n",
-			opencl_error_string(err));
-		exit(1);
-	}
-
-	err = CL_DEVICE_NOT_FOUND;
-	if (use_gpu)
-		err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev,
-			NULL);
-	if (err == CL_DEVICE_NOT_FOUND)
-		err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev,
-			NULL);
-	if (err < 0) {
-		fprintf(stderr, "Error %s while looking for a device.\n",
-			opencl_error_string(err));
-		exit(1);
-	}
-	return dev;
-}
-
-/* Create an OpenCL program from a string and compile it.
- */
-cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
-	const char *program_source, size_t program_size,
-	const char *opencl_options)
-{
-	int err;
-	cl_program program;
-	char *program_log;
-	size_t log_size;
-
-	program = clCreateProgramWithSource(ctx, 1,
-		&program_source, &program_size, &err);
-	if (err < 0) {
-		fprintf(stderr, "Could not create the program\n");
-		exit(1);
-	}
-	err = clBuildProgram(program, 0, NULL, opencl_options, NULL, NULL);
-	if (err < 0) {
-		fprintf(stderr, "Could not build the program.\n");
-		clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0,
-			NULL, &log_size);
-		program_log = (char *) malloc(log_size + 1);
-		program_log[log_size] = '\0';
-		clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
-			log_size + 1, program_log, NULL);
-		fprintf(stderr, "%s\n", program_log);
-		free(program_log);
-		exit(1);
-	}
-	return program;
-}
-
-/* Create an OpenCL program from a source file and compile it.
- */
-cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
-	const char* filename, const char* opencl_options)
-{
-	cl_program program;
-	FILE *program_file;
-	char *program_source;
-	size_t program_size, read;
-
-	program_file = fopen(filename, "r");
-	if (program_file == NULL) {
-		fprintf(stderr, "Could not find the source file.\n");
-		exit(1);
-	}
-	fseek(program_file, 0, SEEK_END);
-	program_size = ftell(program_file);
-	rewind(program_file);
-	program_source = (char *) malloc(program_size + 1);
-	program_source[program_size] = '\0';
-	read = fread(program_source, sizeof(char), program_size, program_file);
-	if (read != program_size) {
-		fprintf(stderr, "Error while reading the kernel.\n");
-		exit(1);
-	}
-	fclose(program_file);
-
-	program = opencl_build_program_from_string(ctx, dev, program_source,
-		program_size, opencl_options);
-	free(program_source);
-
-	return program;
-}
diff --git a/polly/lib/External/ppcg/opencl.h b/polly/lib/External/ppcg/opencl.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/opencl.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _OPENCL_H
-#define _OPENCL_H
-
-#include <isl/ctx.h>
-#include "ppcg_options.h"
-#include "ppcg.h"
-
-int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
-	const char *input, const char *output);
-
-#endif
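For context, a minimal host program built on the utilities declared above might look as follows (a sketch: the no-op kernel string and the omitted error handling are illustrative assumptions):

#include <string.h>
#if defined(__APPLE__)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "ocl_utilities.h"

int main(void)
{
	const char *src = "__kernel void noop() { }";	/* placeholder */
	cl_int err;
	cl_device_id dev = opencl_create_device(1 /* prefer a GPU */);
	cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
	cl_program prog = opencl_build_program_from_string(ctx, dev,
		src, strlen(src), "");
	/* ... create a command queue, set kernel arguments, enqueue ... */
	clReleaseProgram(prog);
	clReleaseContext(ctx);
	return 0;
}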
diff --git a/polly/lib/External/ppcg/opencl_test.sh.in b/polly/lib/External/ppcg/opencl_test.sh.in
deleted file mode 100644
--- a/polly/lib/External/ppcg/opencl_test.sh.in
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/sh
-
-keep=no
-
-for option; do
-	case "$option" in
-	--keep)
-		keep=yes
-		;;
-	esac
-done
-
-EXEEXT=@EXEEXT@
-VERSION=@GIT_HEAD_VERSION@
-CC="@CC@"
-CFLAGS="--std=gnu99"
-srcdir="@srcdir@"
-
-if [ $keep = "yes" ]; then
-	OUTDIR="opencl_test.$VERSION"
-	mkdir "$OUTDIR" || exit 1
-else
-	if test "x$TMPDIR" = "x"; then
-		TMPDIR=/tmp
-	fi
-	OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1
-fi
-
-run_tests () {
-	subdir=$1
-	ppcg_options=$2
-
-	echo Test with PPCG options \'$ppcg_options\'
-	mkdir ${OUTDIR}/${subdir} || exit 1
-	for i in $srcdir/tests/*.c; do
-		echo $i
-		name=`basename $i`
-		name="${name%.c}"
-		out_c="${OUTDIR}/${subdir}/$name.ppcg.c"
-		out="${OUTDIR}/${subdir}/$name.ppcg$EXEEXT"
-		options="--target=opencl --opencl-no-use-gpu $ppcg_options"
-		functions="$srcdir/tests/${name}_opencl_functions.cl"
-		if test -f $functions; then
-			options="$options --opencl-include-file=$functions"
-			options="$options --opencl-compiler-options=-I."
-		fi
-		./ppcg$EXEEXT $options $i -o "$out_c" || exit
-		$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
-			-I. "$out_c" -o "$out" || exit
-		$out || exit
-	done
-}
-
-run_tests default
-run_tests embed --opencl-embed-kernel-code
-
-for i in $srcdir/examples/*.c; do
-	echo $i
-	name=`basename $i`
-	name="${name%.c}"
-	exe_ref="${OUTDIR}/$name.ref$EXEEXT"
-	gen_ocl="${OUTDIR}/$name.ppcg.c"
-	exe_ocl="${OUTDIR}/$name.ppcg$EXEEXT"
-	output_ref="${OUTDIR}/$name.ref.out"
-	output_ocl="${OUTDIR}/$name.ppcg.out"
-	$CC $CFLAGS $i -o $exe_ref || exit
-	./ppcg$EXEEXT --target=opencl --opencl-no-use-gpu $i -o "$gen_ocl" || \
-		exit
-	$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
-		"$gen_ocl" -o "$exe_ocl" || exit
-	$exe_ref > $output_ref || exit
-	$exe_ocl > $output_ocl || exit
-	cmp $output_ref $output_ocl || exit
-done
-
-if [ $keep = "no" ]; then
-	rm -r "${OUTDIR}"
-fi
diff --git a/polly/lib/External/ppcg/polybench_test.sh.in b/polly/lib/External/ppcg/polybench_test.sh.in
deleted file mode 100644
--- a/polly/lib/External/ppcg/polybench_test.sh.in
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/bin/sh
-
-keep=no
-verbose=no
-
-for option; do
-	case "$option" in
-	--keep)
-		keep=yes
-		;;
-	--verbose)
-		verbose=yes
-		;;
-	esac
-done
-
-EXEEXT=@EXEEXT@
-DIR=@POLYBENCH_DIR@
-VERSION=@GIT_HEAD_VERSION@
-SIZE=-DMINI_DATASET
-CC="@CC@"
-HAVE_OPENCL=@HAVE_OPENCL@
-HAVE_OPENMP=@HAVE_OPENMP@
-srcdir="@srcdir@"
-if [ $keep = "yes" ]; then
-	OUTDIR="out.$VERSION"
-	mkdir "$OUTDIR" || exit 1
-else
-	if test "x$TMPDIR" = "x"; then
-		TMPDIR=/tmp
-	fi
-	OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1
-fi
-CPPFLAGS="-DPOLYBENCH_USE_C99_PROTO -DPOLYBENCH_DUMP_ARRAYS"
-CPPFLAGS="$CPPFLAGS $SIZE -I $DIR/utilities"
-CFLAGS="-lm --std=gnu99"
-
-echo "Running tests in folder ${OUTDIR}"
-
-run_tests () {
-	ext=$1
-
-	ppcg_options=$2
-	cc_options=$3
-
-	if [ "x$ppcg_options" = "x" ]; then
-		ppcg_option_str="none"
-	else
-		ppcg_option_str=$ppcg_options
-	fi
-
-	if [ "x$cc_options" = "x" ]; then
-		cc_option_str="none"
-	else
-		cc_option_str=$cc_options
-	fi
-
-	echo Test: $ext, ppcg options: $ppcg_option_str, CC options: $cc_option_str
-	for i in `cat $DIR/utilities/benchmark_list`; do
-		echo $i
-		name=`basename $i`
-		name=${name%.c}
-		source_opt="${OUTDIR}/$name.$ext.c"
-		prog_orig=${OUTDIR}/$name.orig${EXEEXT}
-		prog_opt=${OUTDIR}/$name.$ext${EXEEXT}
-		output_orig=${OUTDIR}/$name.orig.out
-		output_opt=${OUTDIR}/$name.$ext.out
-		dir=`dirname $i`
-		if [ $verbose = "yes" ]; then
-			echo ./ppcg$EXEEXT -I $DIR/$dir $DIR/$i \
-				$CPPFLAGS -o $source_opt $ppcg_options
-		fi
-		./ppcg$EXEEXT -I $DIR/$dir $DIR/$i $CPPFLAGS \
-			-o $source_opt $ppcg_options || exit
-		$CC -I $DIR/$dir $CPPFLAGS $DIR/$i -o $prog_orig \
-			$DIR/utilities/polybench.c $CFLAGS
-		$prog_orig 2> $output_orig
-		if [ $verbose = "yes" ]; then
-			echo $CC -I $DIR/$dir $CPPFLAGS $source_opt \
-				-o $prog_opt $DIR/utilities/polybench.c \
-				$CFLAGS $cc_options
-		fi
-		$CC -I $DIR/$dir $CPPFLAGS $source_opt -o $prog_opt \
-			$DIR/utilities/polybench.c $CFLAGS $cc_options || exit
-
-		$prog_opt 2> $output_opt
-		cmp $output_orig $output_opt || exit
-	done
-}
-
-run_tests ppcg "--target=c --tile"
-run_tests ppcg_live "--target=c --no-live-range-reordering --tile"
-
-# Test OpenMP code, if compiler supports openmp
-if [ $HAVE_OPENMP = "yes" ]; then
-	run_tests ppcg_omp "--target=c --openmp" -fopenmp
-	echo Introduced `grep -R 'omp parallel' "${OUTDIR}" | wc -l` '"pragma omp parallel for"'
-else
-	echo Compiler does not support OpenMP. Skipping OpenMP tests.
-fi
-
-if [ $HAVE_OPENCL = "yes" ]; then
-	run_tests ppcg_opencl "--target=opencl --opencl-no-use-gpu" \
-		"-I $srcdir $srcdir/ocl_utilities.c -lOpenCL"
-fi
-
-if [ $keep = "no" ]; then
-	rm -r "${OUTDIR}"
-fi
diff --git a/polly/lib/External/ppcg/ppcg.h b/polly/lib/External/ppcg/ppcg.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/ppcg.h
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef PPCG_H
-#define PPCG_H
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "ppcg_options.h"
-
-const char *ppcg_base_name(const char *filename);
-int ppcg_extract_base_name(char *name, const char *input);
-
-/* Representation of the scop for use inside PPCG.
- *
- * "options" are the options specified by the user.
- * Some fields in this structure may depend on some of the options.
- *
- * "start" and "end" are file offsets of the corresponding program text.
- * "context" represents constraints on the parameters.
- * "domain" is the union of all iteration domains.
- * "call" contains the iteration domains of statements with a call expression.
- * "reads" contains all potential read accesses.
- * "tagged_reads" is the same as "reads", except that the domain is a wrapped
- *	relation mapping an iteration domain to a reference identifier
- * "live_in" contains the potential read accesses that potentially
- *	have no corresponding writes in the scop.
- * "may_writes" contains all potential write accesses.
- * "tagged_may_writes" is the same as "may_writes", except that the domain
- *	is a wrapped relation mapping an iteration domain
- *	to a reference identifier
- * "must_writes" contains all definite write accesses.
- * "tagged_must_writes" is the same as "must_writes", except that the domain
- *	is a wrapped relation mapping an iteration domain
- *	to a reference identifier
- * "live_out" contains the potential write accesses that are potentially
- *	not killed by any kills or any other writes.
- * "must_kills" contains all definite kill accesses.
- * "tagged_must_kills" is the same as "must_kills", except that the domain
- *	is a wrapped relation mapping an iteration domain
- *	to a reference identifier.
- *
- * "tagger" maps tagged iteration domains to the corresponding untagged
- *	iteration domain.
- *
- * "independence" is the union of all independence filters.
- *
- * "dep_flow" represents the potential flow dependences.
- * "tagged_dep_flow" is the same as "dep_flow", except that both domain and
- *	range are wrapped relations mapping an iteration domain to
- *	a reference identifier. May be NULL if not computed.
- * "dep_false" represents the potential false (anti and output) dependences.
- * "dep_forced" represents the validity constraints that should be enforced
- *	even when live-range reordering is used.
- *	In particular, these constraints ensure that all live-in
- *	accesses remain live-in and that all live-out accesses remain live-out
- *	and that multiple potential sources for the same read are
- *	executed in the original order.
- * "dep_order"/"tagged_dep_order" represents the order dependences between
- *	the live range intervals in "dep_flow"/"tagged_dep_flow".
- *	It is only used if the live_range_reordering
- *	option is set. Otherwise it is NULL.
- *	If "dep_order" is used, then "dep_false" only contains a limited
- *	set of anti and output dependences.
- * "schedule" represents the (original) schedule.
- *
- * "names" contains all variable names that are in use by the scop.
- *	The names are mapped to a dummy value.
- *
- * "pet" is the original pet_scop.
- */
-struct ppcg_scop {
-	struct ppcg_options *options;
-
-	unsigned start;
-	unsigned end;
-
-	isl_set *context;
-	isl_union_set *domain;
-	isl_union_set *call;
-	isl_union_map *tagged_reads;
-	isl_union_map *reads;
-	isl_union_map *live_in;
-	isl_union_map *tagged_may_writes;
-	isl_union_map *may_writes;
-	isl_union_map *tagged_must_writes;
-	isl_union_map *must_writes;
-	isl_union_map *live_out;
-	isl_union_map *tagged_must_kills;
-	isl_union_map *must_kills;
-
-	isl_union_pw_multi_aff *tagger;
-
-	isl_union_map *independence;
-
-	isl_union_map *dep_flow;
-	isl_union_map *tagged_dep_flow;
-	isl_union_map *dep_false;
-	isl_union_map *dep_forced;
-	isl_union_map *dep_order;
-	isl_union_map *tagged_dep_order;
-	isl_schedule *schedule;
-
-	isl_id_to_ast_expr *names;
-
-	struct pet_scop *pet;
-};
-
-int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop);
-__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop,
-	int n, const char *prefix);
-
-int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out,
-	struct ppcg_options *options,
-	__isl_give isl_printer *(*fn)(__isl_take isl_printer *p,
-		struct ppcg_scop *scop, void *user), void *user);
-
-__isl_give isl_schedule *ppcg_compute_schedule(
-	__isl_take isl_schedule_constraints *sc,
-	__isl_keep isl_schedule *schedule, struct ppcg_options *options);
-
-void compute_tagger(struct ppcg_scop *ps);
-void compute_dependences(struct ppcg_scop *scop);
-void eliminate_dead_code(struct ppcg_scop *ps);
-void *ppcg_scop_free(struct ppcg_scop *ps);
-#endif
diff --git a/polly/lib/External/ppcg/ppcg.c b/polly/lib/External/ppcg/ppcg.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/ppcg.c
+++ /dev/null
@@ -1,1067 +0,0 @@
-/*
- * Copyright 2011 INRIA Saclay
- * Copyright 2013 Ecole Normale Superieure
- * Copyright 2015 Sven Verdoolaege
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
- * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
- * 91893 Orsay, France
- * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "ppcg.h"
-#include "ppcg_options.h"
-#include "cuda.h"
-#include "opencl.h"
-#include "cpu.h"
-
-struct options {
-	struct pet_options *pet;
-	struct ppcg_options *ppcg;
-	char *input;
-	char *output;
-};
-
-const char *ppcg_version(void);
-static void print_version(void)
-{
-	printf("%s", ppcg_version());
-}
-
-ISL_ARGS_START(struct options, options_args)
-ISL_ARG_CHILD(struct options, pet, "pet", &pet_options_args, "pet options")
-ISL_ARG_CHILD(struct options, ppcg, NULL, &ppcg_options_args, "ppcg options")
-ISL_ARG_STR(struct options, output, 'o', NULL,
-	"filename", NULL, "output filename (c and opencl targets)")
-ISL_ARG_ARG(struct options, input, "input", NULL)
-ISL_ARG_VERSION(print_version)
-ISL_ARGS_END
-
-ISL_ARG_DEF(options, struct options, options_args)
-
-/* Return a pointer to the final path component of "filename" or
- * to "filename" itself if it does not contain any components.
- */
-const char *ppcg_base_name(const char *filename)
-{
-	const char *base;
-
-	base = strrchr(filename, '/');
-	if (base)
-		return ++base;
-	else
-		return filename;
-}
-
-/* Copy the base name of "input" to "name" and return its length.
- * "name" is not NULL terminated.
- *
- * In particular, remove all leading directory components and
- * the final extension, if any.
- */
-int ppcg_extract_base_name(char *name, const char *input)
-{
-	const char *base;
-	const char *ext;
-	int len;
-
-	base = ppcg_base_name(input);
-	ext = strrchr(base, '.');
-	len = ext ? ext - base : strlen(base);
-
-	memcpy(name, base, len);
-
-	return len;
-}
-
-/* Does "scop" refer to any arrays that are declared, but not
- * exposed to the code after the scop?
- */
-int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop)
-{
-	int i;
-
-	if (!scop)
-		return 0;
-
-	// This is a pet feature not available in Polly.
-	return 0;
-
-	for (i = 0; i < scop->pet->n_array; ++i)
-		if (scop->pet->arrays[i]->declared &&
-		    !scop->pet->arrays[i]->exposed)
-			return 1;
-
-	return 0;
-}
-
-/* Collect all variable names that are in use in "scop".
- * In particular, collect all parameters in the context and
- * all the array names.
- * Store these names in an isl_id_to_ast_expr by mapping
- * them to a dummy value (0).
- */
-static __isl_give isl_id_to_ast_expr *collect_names(struct pet_scop *scop)
-{
-	int i, n;
-	isl_ctx *ctx;
-	isl_ast_expr *zero;
-	isl_id_to_ast_expr *names;
-
-	ctx = isl_set_get_ctx(scop->context);
-
-	n = isl_set_dim(scop->context, isl_dim_param);
-
-	names = isl_id_to_ast_expr_alloc(ctx, n + scop->n_array);
-	zero = isl_ast_expr_from_val(isl_val_zero(ctx));
-
-	for (i = 0; i < n; ++i) {
-		isl_id *id;
-
-		id = isl_set_get_dim_id(scop->context, isl_dim_param, i);
-		names = isl_id_to_ast_expr_set(names,
-			id, isl_ast_expr_copy(zero));
-	}
-
-	for (i = 0; i < scop->n_array; ++i) {
-		struct pet_array *array = scop->arrays[i];
-		isl_id *id;
-
-		id = isl_set_get_tuple_id(array->extent);
-		names = isl_id_to_ast_expr_set(names,
-			id, isl_ast_expr_copy(zero));
-	}
-
-	isl_ast_expr_free(zero);
-
-	return names;
-}
-
-/* Return an isl_id called "prefix%d", with "%d" set to "i".
- * If an isl_id with such a name already appears among the variable names
- * of "scop", then adjust the name to "prefix%d_%d".
- */
-static __isl_give isl_id *generate_name(struct ppcg_scop *scop,
-	const char *prefix, int i)
-{
-	int j;
-	char name[16];
-	isl_ctx *ctx;
-	isl_id *id;
-	int has_name;
-
-	ctx = isl_set_get_ctx(scop->context);
-	snprintf(name, sizeof(name), "%s%d", prefix, i);
-	id = isl_id_alloc(ctx, name, NULL);
-
-	j = 0;
-	while ((has_name = isl_id_to_ast_expr_has(scop->names, id)) == 1) {
-		isl_id_free(id);
-		snprintf(name, sizeof(name), "%s%d_%d", prefix, i, j++);
-		id = isl_id_alloc(ctx, name, NULL);
-	}
-
-	return has_name < 0 ? isl_id_free(id) : id;
-}
-
-/* Return a list of "n" isl_ids of the form "prefix%d".
- * If an isl_id with such a name already appears among the variable names
- * of "scop", then adjust the name to "prefix%d_%d".
- */
-__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop,
-	int n, const char *prefix)
-{
-	int i;
-	isl_ctx *ctx;
-	isl_id_list *names;
-
-	ctx = isl_set_get_ctx(scop->context);
-	names = isl_id_list_alloc(ctx, n);
-	for (i = 0; i < n; ++i) {
-		isl_id *id;
-
-		id = generate_name(scop, prefix, i);
-		names = isl_id_list_add(names, id);
-	}
-
-	return names;
-}
-
-/* Is "stmt" not a kill statement?
- */
-static int is_not_kill(struct pet_stmt *stmt)
-{
-	return !pet_stmt_is_kill(stmt);
-}
-
-/* Collect the iteration domains of the statements in "scop" that
- * satisfy "pred".
- */
-static __isl_give isl_union_set *collect_domains(struct pet_scop *scop,
-	int (*pred)(struct pet_stmt *stmt))
-{
-	int i;
-	isl_set *domain_i;
-	isl_union_set *domain;
-
-	if (!scop)
-		return NULL;
-
-	domain = isl_union_set_empty(isl_set_get_space(scop->context));
-
-	for (i = 0; i < scop->n_stmt; ++i) {
-		struct pet_stmt *stmt = scop->stmts[i];
-
-		if (!pred(stmt))
-			continue;
-
-		if (stmt->n_arg > 0)
-			isl_die(isl_union_set_get_ctx(domain),
-				isl_error_unsupported,
-				"data dependent conditions not supported",
-				return isl_union_set_free(domain));
-
-		domain_i = isl_set_copy(scop->stmts[i]->domain);
-		domain = isl_union_set_add_set(domain, domain_i);
-	}
-
-	return domain;
-}
-
-/* Collect the iteration domains of the statements in "scop",
- * skipping kill statements.
- */
-static __isl_give isl_union_set *collect_non_kill_domains(struct pet_scop *scop)
-{
-	return collect_domains(scop, &is_not_kill);
-}
-
-/* This function is used as a callback to pet_expr_foreach_call_expr
- * to detect if there is any call expression in the input expression.
- * Assign the value 1 to the integer that "user" points to and
- * abort the search since we have found what we were looking for.
- */
-static int set_has_call(__isl_keep pet_expr *expr, void *user)
-{
-	int *has_call = user;
-
-	*has_call = 1;
-
-	return -1;
-}
-
-/* Does "expr" contain any call expressions?
- */
-static int expr_has_call(__isl_keep pet_expr *expr)
-{
-	int has_call = 0;
-
-	if (pet_expr_foreach_call_expr(expr, &set_has_call, &has_call) < 0 &&
-	    !has_call)
-		return -1;
-
-	return has_call;
-}
-
-/* This function is a callback for pet_tree_foreach_expr.
- * If "expr" contains any call (sub)expressions, then set *has_call
- * and abort the search.
- */
-static int check_call(__isl_keep pet_expr *expr, void *user)
-{
-	int *has_call = user;
-
-	if (expr_has_call(expr))
-		*has_call = 1;
-
-	return *has_call ? -1 : 0;
-}
-
-/* Does "stmt" contain any call expressions?
- */
-static int has_call(struct pet_stmt *stmt)
-{
-	int has_call = 0;
-
-	if (pet_tree_foreach_expr(stmt->body, &check_call, &has_call) < 0 &&
-	    !has_call)
-		return -1;
-
-	return has_call;
-}
-
-/* Collect the iteration domains of the statements in "scop"
- * that contain a call expression.
- */
-static __isl_give isl_union_set *collect_call_domains(struct pet_scop *scop)
-{
-	return collect_domains(scop, &has_call);
-}
-
-/* Given a union of "tagged" access relations of the form
- *
- *	[S_i[...] -> R_j[]] -> A_k[...]
- *
- * project out the "tags" (R_j[]).
- * That is, return a union of relations of the form
- *
- *	S_i[...] -> A_k[...]
- */
-static __isl_give isl_union_map *project_out_tags(
-	__isl_take isl_union_map *umap)
-{
-	return isl_union_map_domain_factor_domain(umap);
-}
-
-/* Construct a function from tagged iteration domains to the corresponding
- * untagged iteration domains with as range of the wrapped map in the domain
- * the reference tags that appear in any of the reads, writes or kills.
- * Store the result in ps->tagger.
- *
- * For example, if the statement with iteration space S[i,j]
- * contains two array references R_1[] and R_2[], then ps->tagger will contain
- *
- *	{ [S[i,j] -> R_1[]] -> S[i,j]; [S[i,j] -> R_2[]] -> S[i,j] }
- */
-void compute_tagger(struct ppcg_scop *ps)
-{
-	isl_union_map *tagged;
-	isl_union_pw_multi_aff *tagger;
-
-	tagged = isl_union_map_copy(ps->tagged_reads);
-	tagged = isl_union_map_union(tagged,
-		isl_union_map_copy(ps->tagged_may_writes));
-	tagged = isl_union_map_union(tagged,
-		isl_union_map_copy(ps->tagged_must_kills));
-	tagged = isl_union_map_universe(tagged);
-	tagged = isl_union_set_unwrap(isl_union_map_domain(tagged));
-
-	tagger = isl_union_map_domain_map_union_pw_multi_aff(tagged);
-
-	ps->tagger = tagger;
-}
-
-/* Compute the live out accesses, i.e., the writes that are
- * potentially not killed by any kills or any other writes, and
- * store them in ps->live_out.
- *
- * We compute the "dependence" of any "kill" (an explicit kill
- * or a must write) on any may write.
- * The elements accessed by the may writes with a "depending" kill
- * also accessing the element are definitely killed.
- * The remaining may writes can potentially be live out.
- *
- * The result of the dependence analysis is
- *
- *	{ IW -> [IK -> A] }
- *
- * with IW the instance of the write statement, IK the instance of kill
- * statement and A the element that was killed.
- * The range factor range is
- *
- *	{ IW -> A }
- *
- * containing all such pairs for which there is a kill statement instance,
- * i.e., all pairs that have been killed.
- */
-static void compute_live_out(struct ppcg_scop *ps)
-{
-	isl_schedule *schedule;
-	isl_union_map *kills;
-	isl_union_map *exposed;
-	isl_union_map *covering;
-	isl_union_access_info *access;
-	isl_union_flow *flow;
-
-	schedule = isl_schedule_copy(ps->schedule);
-	kills = isl_union_map_union(isl_union_map_copy(ps->must_writes),
-		isl_union_map_copy(ps->must_kills));
-	access = isl_union_access_info_from_sink(kills);
-	access = isl_union_access_info_set_may_source(access,
-		isl_union_map_copy(ps->may_writes));
-	access = isl_union_access_info_set_schedule(access, schedule);
-	flow = isl_union_access_info_compute_flow(access);
-	covering = isl_union_flow_get_full_may_dependence(flow);
-	isl_union_flow_free(flow);
-
-	covering = isl_union_map_range_factor_range(covering);
-	exposed = isl_union_map_copy(ps->may_writes);
-	exposed = isl_union_map_subtract(exposed, covering);
-	ps->live_out = exposed;
-}
-
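The wrapped-domain convention used throughout these functions can be shown in isolation. This standalone sketch tags a single read with its reference identifier and then strips the tag with the same isl call that project_out_tags() uses:

#include <isl/ctx.h>
#include <isl/union_map.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_union_map *tagged, *untagged;

	/* Statement S reads A through reference R_1. */
	tagged = isl_union_map_read_from_str(ctx,
		"{ [S[i] -> R_1[]] -> A[i] }");
	/* Dropping the tag yields { S[i] -> A[i] }. */
	untagged = isl_union_map_domain_factor_domain(tagged);
	isl_union_map_dump(untagged);

	isl_union_map_free(untagged);
	isl_ctx_free(ctx);
	return 0;
}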
-/* Compute the tagged flow dependences and the live_in accesses and store
- * the results in ps->tagged_dep_flow and ps->live_in.
- *
- * We allow both the must writes and the must kills to serve as
- * definite sources such that a subsequent read would not depend
- * on any earlier write. The resulting flow dependences with
- * a must kill as source reflect possibly uninitialized reads.
- * No dependences need to be introduced to protect such reads
- * (other than those imposed by potential flows from may writes
- * that follow the kill). We therefore remove those flow dependences.
- * This is also useful for the dead code elimination, which assumes
- * the flow sources are non-kill instances.
- */
-static void compute_tagged_flow_dep_only(struct ppcg_scop *ps)
-{
-	isl_union_pw_multi_aff *tagger;
-	isl_schedule *schedule;
-	isl_union_map *live_in;
-	isl_union_access_info *access;
-	isl_union_flow *flow;
-	isl_union_map *must_source;
-	isl_union_map *kills;
-	isl_union_map *tagged_flow;
-
-	tagger = isl_union_pw_multi_aff_copy(ps->tagger);
-	schedule = isl_schedule_copy(ps->schedule);
-	schedule = isl_schedule_pullback_union_pw_multi_aff(schedule, tagger);
-	kills = isl_union_map_copy(ps->tagged_must_kills);
-	must_source = isl_union_map_copy(ps->tagged_must_writes);
-	must_source = isl_union_map_union(must_source,
-		isl_union_map_copy(kills));
-	access = isl_union_access_info_from_sink(
-		isl_union_map_copy(ps->tagged_reads));
-	access = isl_union_access_info_set_must_source(access, must_source);
-	access = isl_union_access_info_set_may_source(access,
-		isl_union_map_copy(ps->tagged_may_writes));
-	access = isl_union_access_info_set_schedule(access, schedule);
-	flow = isl_union_access_info_compute_flow(access);
-	tagged_flow = isl_union_flow_get_may_dependence(flow);
-	tagged_flow = isl_union_map_subtract_domain(tagged_flow,
-		isl_union_map_domain(kills));
-	ps->tagged_dep_flow = tagged_flow;
-	live_in = isl_union_flow_get_may_no_source(flow);
-	ps->live_in = project_out_tags(live_in);
-	isl_union_flow_free(flow);
-}
-
-/* Compute ps->dep_flow from ps->tagged_dep_flow
- * by projecting out the reference tags.
- */
-static void derive_flow_dep_from_tagged_flow_dep(struct ppcg_scop *ps)
-{
-	ps->dep_flow = isl_union_map_copy(ps->tagged_dep_flow);
-	ps->dep_flow = isl_union_map_factor_domain(ps->dep_flow);
-}
-
-/* Compute the flow dependences and the live_in accesses and store
- * the results in ps->dep_flow and ps->live_in.
- * A copy of the flow dependences, tagged with the reference tags
- * is stored in ps->tagged_dep_flow.
- *
- * We first compute ps->tagged_dep_flow, i.e., the tagged flow dependences
- * and then project out the tags.
- */
-static void compute_tagged_flow_dep(struct ppcg_scop *ps)
-{
-	compute_tagged_flow_dep_only(ps);
-	derive_flow_dep_from_tagged_flow_dep(ps);
-}
-
-/* Compute the order dependences that prevent the potential live ranges
- * from overlapping.
- *
- * In particular, construct a union of relations
- *
- *	[R[...] -> R_1[]] -> [W[...] -> R_2[]]
- *
- * where [R[...] -> R_1[]] is the range of one or more live ranges
- * (i.e., a read) and [W[...] -> R_2[]] is the domain of one or more
- * live ranges (i.e., a write). Moreover, the read and the write
- * access the same memory element and the read occurs before the write
- * in the original schedule.
- * The scheduler allows some of these dependences to be violated, provided
- * the adjacent live ranges are all local (i.e., their domain and range
- * are mapped to the same point by the current schedule band).
- *
- * Note that if a live range is not local, then we need to make
- * sure it does not overlap with _any_ other live range, and not
- * just with the "previous" and/or the "next" live range.
- * We therefore add order dependences between reads and
- * _any_ later potential write.
- *
- * We also need to be careful about writes without a corresponding read.
- * They are already prevented from moving past non-local preceding
- * intervals, but we also need to prevent them from moving past non-local
- * following intervals.
- * We therefore also add order dependences from
- * potential writes that do not appear in any intervals
- * to all later potential writes.
- * Note that dead code elimination should have removed most of these
- * dead writes, but the dead code elimination may not remove all dead writes,
- * so, to be safe, we need to take them into account.
- *
- * The order dependences are computed by computing the "dataflow"
- * from the above unmatched writes and the reads to the may writes.
- * The unmatched writes and the reads are treated as may sources
- * such that they would not kill order dependences from earlier
- * such writes and reads.
- */
-static void compute_order_dependences(struct ppcg_scop *ps)
-{
-	isl_union_map *reads;
-	isl_union_map *shared_access;
-	isl_union_set *matched;
-	isl_union_map *unmatched;
-	isl_union_pw_multi_aff *tagger;
-	isl_schedule *schedule;
-	isl_union_access_info *access;
-	isl_union_flow *flow;
-
-	tagger = isl_union_pw_multi_aff_copy(ps->tagger);
-	schedule = isl_schedule_copy(ps->schedule);
-	schedule = isl_schedule_pullback_union_pw_multi_aff(schedule, tagger);
-	reads = isl_union_map_copy(ps->tagged_reads);
-	matched = isl_union_map_domain(isl_union_map_copy(ps->tagged_dep_flow));
-	unmatched = isl_union_map_copy(ps->tagged_may_writes);
-	unmatched = isl_union_map_subtract_domain(unmatched, matched);
-	reads = isl_union_map_union(reads, unmatched);
-	access = isl_union_access_info_from_sink(
-		isl_union_map_copy(ps->tagged_may_writes));
-	access = isl_union_access_info_set_may_source(access, reads);
-	access = isl_union_access_info_set_schedule(access, schedule);
-	flow = isl_union_access_info_compute_flow(access);
-	shared_access = isl_union_flow_get_may_dependence(flow);
-	isl_union_flow_free(flow);
-
-	ps->tagged_dep_order = isl_union_map_copy(shared_access);
-	ps->dep_order = isl_union_map_factor_domain(shared_access);
-}
-
-/* Compute those validity dependences of the program represented by "scop"
- * that should be unconditionally enforced even when live-range reordering
- * is used.
- *
- * In particular, compute the external false dependences
- * as well as order dependences between sources with the same sink.
- * The anti-dependences are already taken care of by the order dependences.
- * The external false dependences are only used to ensure that live-in and
- * live-out data is not overwritten by any writes inside the scop.
- * The independences are removed from the external false dependences,
- * but not from the order dependences between sources with the same sink.
- *
- * In particular, the reads from live-in data need to precede any
- * later write to the same memory element.
- * As to live-out data, the last writes need to remain the last writes.
- * That is, any earlier write in the original schedule needs to precede
- * the last write to the same memory element in the computed schedule.
- * The possible last writes have been computed by compute_live_out.
- * They may include kills, but if the last access is a kill,
- * then the corresponding dependences will effectively be ignored
- * since we do not schedule any kill statements.
- *
- * Note that the set of live-in and live-out accesses may be
- * an overapproximation. There may therefore be potential writes
- * before a live-in access and after a live-out access.
- *
- * In the presence of may-writes, there may be multiple live-ranges
- * with the same sink, accessing the same memory element.
- * The sources of these live-ranges need to be executed
- * in the same relative order as in the original program
- * since we do not know which of the may-writes will actually
- * perform a write. Consider all sources that share a sink and
- * that may write to the same memory element and compute
- * the order dependences among them.
- */
-static void compute_forced_dependences(struct ppcg_scop *ps)
-{
-	isl_union_map *shared_access;
-	isl_union_map *exposed;
-	isl_union_map *live_in;
-	isl_union_map *sink_access;
-	isl_union_map *shared_sink;
-	isl_union_access_info *access;
-	isl_union_flow *flow;
-	isl_schedule *schedule;
-
-	exposed = isl_union_map_copy(ps->live_out);
-	schedule = isl_schedule_copy(ps->schedule);
-	access = isl_union_access_info_from_sink(exposed);
-	access = isl_union_access_info_set_may_source(access,
-		isl_union_map_copy(ps->may_writes));
-	access = isl_union_access_info_set_schedule(access, schedule);
-	flow = isl_union_access_info_compute_flow(access);
-	shared_access = isl_union_flow_get_may_dependence(flow);
-	isl_union_flow_free(flow);
-	ps->dep_forced = shared_access;
-
-	schedule = isl_schedule_copy(ps->schedule);
-	access = isl_union_access_info_from_sink(
-		isl_union_map_copy(ps->may_writes));
-	access = isl_union_access_info_set_may_source(access,
-		isl_union_map_copy(ps->live_in));
-	access = isl_union_access_info_set_schedule(access, schedule);
-	flow = isl_union_access_info_compute_flow(access);
-	live_in = isl_union_flow_get_may_dependence(flow);
-	isl_union_flow_free(flow);
-
-	ps->dep_forced = isl_union_map_union(ps->dep_forced, live_in);
-	ps->dep_forced = isl_union_map_subtract(ps->dep_forced,
-		isl_union_map_copy(ps->independence));
-
-	schedule = isl_schedule_copy(ps->schedule);
-	sink_access = isl_union_map_copy(ps->tagged_dep_flow);
-	sink_access = isl_union_map_range_product(sink_access,
-		isl_union_map_copy(ps->tagged_may_writes));
-	sink_access = isl_union_map_domain_factor_domain(sink_access);
-	access = isl_union_access_info_from_sink(
-		isl_union_map_copy(sink_access));
-	access = isl_union_access_info_set_may_source(access, sink_access);
-	access = isl_union_access_info_set_schedule(access, schedule);
-	flow = isl_union_access_info_compute_flow(access);
-	shared_sink = isl_union_flow_get_may_dependence(flow);
-	isl_union_flow_free(flow);
-	ps->dep_forced = isl_union_map_union(ps->dep_forced, shared_sink);
-}
-
-/* Remove independence from the tagged flow dependences.
- * Since the user has guaranteed that source and sink of an independence
- * can be executed in any order, there cannot be a flow dependence
- * between them, so they can be removed from the set of flow dependences.
- * However, if the source of such a flow dependence is a must write,
- * then it may have killed other potential sources, which would have
- * to be recovered if we were to remove those flow dependences.
- * We therefore keep the flow dependences that originate in a must write,
- * even if it corresponds to a known independence.
- */
-static void remove_independences_from_tagged_flow(struct ppcg_scop *ps)
-{
-	isl_union_map *tf;
-	isl_union_set *indep;
-	isl_union_set *mw;
-
-	tf = isl_union_map_copy(ps->tagged_dep_flow);
-	tf = isl_union_map_zip(tf);
-	indep = isl_union_map_wrap(isl_union_map_copy(ps->independence));
-	tf = isl_union_map_intersect_domain(tf, indep);
-	tf = isl_union_map_zip(tf);
-	mw = isl_union_map_domain(isl_union_map_copy(ps->tagged_must_writes));
-	tf = isl_union_map_subtract_domain(tf, mw);
-	ps->tagged_dep_flow = isl_union_map_subtract(ps->tagged_dep_flow, tf);
-}
-
-/* Compute the dependences of the program represented by "scop"
- * in case live range reordering is allowed.
- *
- * We compute the actual live ranges and the corresponding order
- * false dependences.
- *
- * The independences are removed from the flow dependences
- * (provided the source is not a must-write) as well as
- * from the external false dependences (by compute_forced_dependences).
- */
-static void compute_live_range_reordering_dependences(struct ppcg_scop *ps)
-{
-	compute_tagged_flow_dep_only(ps);
-	remove_independences_from_tagged_flow(ps);
-	derive_flow_dep_from_tagged_flow_dep(ps);
-	compute_order_dependences(ps);
-	compute_forced_dependences(ps);
-}
-
-/* Compute the potential flow dependences and the potential live in
- * accesses.
- */
-static void compute_flow_dep(struct ppcg_scop *ps)
-{
-	isl_union_access_info *access;
-	isl_union_flow *flow;
-
-	access = isl_union_access_info_from_sink(isl_union_map_copy(ps->reads));
-	access = isl_union_access_info_set_must_source(access,
-		isl_union_map_copy(ps->must_writes));
-	access = isl_union_access_info_set_may_source(access,
-		isl_union_map_copy(ps->may_writes));
-	access = isl_union_access_info_set_schedule(access,
-		isl_schedule_copy(ps->schedule));
-	flow = isl_union_access_info_compute_flow(access);
-
-	ps->dep_flow = isl_union_flow_get_may_dependence(flow);
-	ps->live_in = isl_union_flow_get_may_no_source(flow);
-	isl_union_flow_free(flow);
-}
-
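The isl dataflow interface used by these routines can be exercised on a toy example. In this sketch S writes A, T reads the element written one iteration earlier, and the must sources make the result exact:

#include <isl/ctx.h>
#include <isl/union_map.h>
#include <isl/flow.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_union_access_info *access;
	isl_union_flow *flow;
	isl_union_map *dep;

	access = isl_union_access_info_from_sink(
		isl_union_map_read_from_str(ctx,
			"{ T[i] -> A[i - 1] : 1 <= i < 4 }"));
	access = isl_union_access_info_set_must_source(access,
		isl_union_map_read_from_str(ctx,
			"{ S[i] -> A[i] : 0 <= i < 4 }"));
	/* Execute all of S before all of T. */
	access = isl_union_access_info_set_schedule_map(access,
		isl_union_map_read_from_str(ctx,
			"{ S[i] -> [0, i]; T[i] -> [1, i] }"));
	flow = isl_union_access_info_compute_flow(access);
	/* { S[i] -> T[1 + i] : 0 <= i <= 2 } */
	dep = isl_union_flow_get_must_dependence(flow);
	isl_union_map_dump(dep);

	isl_union_map_free(dep);
	isl_union_flow_free(flow);
	isl_ctx_free(ctx);
	return 0;
}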
-/* Compute the dependences of the program represented by "scop".
- * Store the computed potential flow dependences
- * in scop->dep_flow and the reads with potentially no corresponding writes in
- * scop->live_in.
- * Store the potential live out accesses in scop->live_out.
- * Store the potential false (anti and output) dependences in scop->dep_false.
- *
- * If live range reordering is allowed, then we compute a separate
- * set of order dependences and a set of external false dependences
- * in compute_live_range_reordering_dependences.
- */
-void compute_dependences(struct ppcg_scop *scop)
-{
-	isl_union_map *may_source;
-	isl_union_access_info *access;
-	isl_union_flow *flow;
-
-	if (!scop)
-		return;
-
-	compute_live_out(scop);
-
-	if (scop->options->live_range_reordering)
-		compute_live_range_reordering_dependences(scop);
-	else if (scop->options->target != PPCG_TARGET_C)
-		compute_tagged_flow_dep(scop);
-	else
-		compute_flow_dep(scop);
-
-	may_source = isl_union_map_union(isl_union_map_copy(scop->may_writes),
-		isl_union_map_copy(scop->reads));
-	access = isl_union_access_info_from_sink(
-		isl_union_map_copy(scop->may_writes));
-	access = isl_union_access_info_set_must_source(access,
-		isl_union_map_copy(scop->must_writes));
-	access = isl_union_access_info_set_may_source(access, may_source);
-	access = isl_union_access_info_set_schedule(access,
-		isl_schedule_copy(scop->schedule));
-	flow = isl_union_access_info_compute_flow(access);
-
-	scop->dep_false = isl_union_flow_get_may_dependence(flow);
-	scop->dep_false = isl_union_map_coalesce(scop->dep_false);
-	isl_union_flow_free(flow);
-}
-
-/* Eliminate dead code from ps->domain.
- *
- * In particular, intersect both ps->domain and the domain of
- * ps->schedule with the (parts of) iteration
- * domains that are needed to produce the output or for statement
- * iterations that call functions.
- * Also intersect the range of the dataflow dependences with
- * this domain such that the removed instances will no longer
- * be considered as targets of dataflow.
- *
- * We start with the iteration domains that call functions
- * and the set of iterations that last write to an array
- * (except those that are later killed).
- *
- * Then we add those statement iterations that produce
- * something needed by the "live" statement iterations.
- * We keep doing this until no more statement iterations can be added.
- * To ensure that the procedure terminates, we compute the affine
- * hull of the live iterations (bounded to the original iteration
- * domains) each time we have added extra iterations.
- */
-void eliminate_dead_code(struct ppcg_scop *ps)
-{
-	isl_union_set *live;
-	isl_union_map *dep;
-	isl_union_pw_multi_aff *tagger;
-
-	live = isl_union_map_domain(isl_union_map_copy(ps->live_out));
-	if (!isl_union_set_is_empty(ps->call)) {
-		live = isl_union_set_union(live, isl_union_set_copy(ps->call));
-		live = isl_union_set_coalesce(live);
-	}
-
-	dep = isl_union_map_copy(ps->dep_flow);
-	dep = isl_union_map_reverse(dep);
-
-	for (;;) {
-		isl_union_set *extra;
-
-		extra = isl_union_set_apply(isl_union_set_copy(live),
-			isl_union_map_copy(dep));
-		if (isl_union_set_is_subset(extra, live)) {
-			isl_union_set_free(extra);
-			break;
-		}
-
-		live = isl_union_set_union(live, extra);
-		live = isl_union_set_affine_hull(live);
-		live = isl_union_set_intersect(live,
-			isl_union_set_copy(ps->domain));
-	}
-
-	isl_union_map_free(dep);
-
-	ps->domain = isl_union_set_intersect(ps->domain,
-		isl_union_set_copy(live));
-	ps->schedule = isl_schedule_intersect_domain(ps->schedule,
-		isl_union_set_copy(live));
-	ps->dep_flow = isl_union_map_intersect_range(ps->dep_flow,
-		isl_union_set_copy(live));
-	tagger = isl_union_pw_multi_aff_copy(ps->tagger);
-	live = isl_union_set_preimage_union_pw_multi_aff(live, tagger);
-	ps->tagged_dep_flow = isl_union_map_intersect_range(ps->tagged_dep_flow,
-		live);
-}
-
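One step of the backward propagation in eliminate_dead_code() looks like this on a toy chain of dependences (illustration only); the real code repeats the step, with an affine-hull overapproximation, until nothing new is added:

#include <isl/ctx.h>
#include <isl/union_set.h>
#include <isl/union_map.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	/* S[0] feeds S[1] feeds S[2]; only S[2] is live-out. */
	isl_union_map *dep = isl_union_map_read_from_str(ctx,
		"{ S[i] -> S[i + 1] : 0 <= i < 2 }");
	isl_union_set *live = isl_union_set_read_from_str(ctx, "{ S[2] }");

	/* Map live instances back to their producers: adds S[1]. */
	live = isl_union_set_union(live,
		isl_union_set_apply(isl_union_set_copy(live),
			isl_union_map_reverse(dep)));
	isl_union_set_dump(live);

	isl_union_set_free(live);
	isl_ctx_free(ctx);
	return 0;
}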
-/* Intersect "set" with the set described by "str", taking the NULL
- * string to represent the universal set.
- */
-static __isl_give isl_set *set_intersect_str(__isl_take isl_set *set,
-	const char *str)
-{
-	isl_ctx *ctx;
-	isl_set *set2;
-
-	if (!str)
-		return set;
-
-	ctx = isl_set_get_ctx(set);
-	set2 = isl_set_read_from_str(ctx, str);
-	set = isl_set_intersect(set, set2);
-
-	return set;
-}
-
-void *ppcg_scop_free(struct ppcg_scop *ps)
-{
-	if (!ps)
-		return NULL;
-
-	isl_set_free(ps->context);
-	isl_union_set_free(ps->domain);
-	isl_union_set_free(ps->call);
-	isl_union_map_free(ps->tagged_reads);
-	isl_union_map_free(ps->reads);
-	isl_union_map_free(ps->live_in);
-	isl_union_map_free(ps->tagged_may_writes);
-	isl_union_map_free(ps->tagged_must_writes);
-	isl_union_map_free(ps->may_writes);
-	isl_union_map_free(ps->must_writes);
-	isl_union_map_free(ps->live_out);
-	isl_union_map_free(ps->tagged_must_kills);
-	isl_union_map_free(ps->must_kills);
-	isl_union_map_free(ps->tagged_dep_flow);
-	isl_union_map_free(ps->dep_flow);
-	isl_union_map_free(ps->dep_false);
-	isl_union_map_free(ps->dep_forced);
-	isl_union_map_free(ps->tagged_dep_order);
-	isl_union_map_free(ps->dep_order);
-	isl_schedule_free(ps->schedule);
-	isl_union_pw_multi_aff_free(ps->tagger);
-	isl_union_map_free(ps->independence);
-	isl_id_to_ast_expr_free(ps->names);
-
-	free(ps);
-
-	return NULL;
-}
-
-/* Extract a ppcg_scop from a pet_scop.
- *
- * The constructed ppcg_scop refers to elements from the pet_scop
- * so the pet_scop should not be freed before the ppcg_scop.
- */
-static struct ppcg_scop *ppcg_scop_from_pet_scop(struct pet_scop *scop,
-	struct ppcg_options *options)
-{
-	int i;
-	isl_ctx *ctx;
-	struct ppcg_scop *ps;
-
-	if (!scop)
-		return NULL;
-
-	ctx = isl_set_get_ctx(scop->context);
-
-	ps = isl_calloc_type(ctx, struct ppcg_scop);
-	if (!ps)
-		return NULL;
-
-	ps->names = collect_names(scop);
-	ps->options = options;
-	ps->start = pet_loc_get_start(scop->loc);
-	ps->end = pet_loc_get_end(scop->loc);
-	ps->context = isl_set_copy(scop->context);
-	ps->context = set_intersect_str(ps->context, options->ctx);
-	if (options->non_negative_parameters) {
-		isl_space *space = isl_set_get_space(ps->context);
-		isl_set *nn = isl_set_nat_universe(space);
-		ps->context = isl_set_intersect(ps->context, nn);
-	}
-	ps->domain = collect_non_kill_domains(scop);
-	ps->call = collect_call_domains(scop);
-	ps->tagged_reads = pet_scop_get_tagged_may_reads(scop);
-	ps->reads = pet_scop_get_may_reads(scop);
-	ps->tagged_may_writes = pet_scop_get_tagged_may_writes(scop);
-	ps->may_writes = pet_scop_get_may_writes(scop);
-	ps->tagged_must_writes = pet_scop_get_tagged_must_writes(scop);
-	ps->must_writes = pet_scop_get_must_writes(scop);
-	ps->tagged_must_kills = pet_scop_get_tagged_must_kills(scop);
-	ps->must_kills = pet_scop_get_must_kills(scop);
-	ps->schedule = isl_schedule_copy(scop->schedule);
-	ps->pet = scop;
-	ps->independence = isl_union_map_empty(isl_set_get_space(ps->context));
-	for (i = 0; i < scop->n_independence; ++i)
-		ps->independence = isl_union_map_union(ps->independence,
-			isl_union_map_copy(scop->independences[i]->filter));
-
-	compute_tagger(ps);
-	compute_dependences(ps);
-	eliminate_dead_code(ps);
-
-	if (!ps->context || !ps->domain || !ps->call || !ps->reads ||
-	    !ps->may_writes || !ps->must_writes || !ps->tagged_must_kills ||
-	    !ps->must_kills || !ps->schedule || !ps->independence || !ps->names)
-		return ppcg_scop_free(ps);
-
-	return ps;
-}
-
-/* Internal data structure for ppcg_transform.
- */ -struct ppcg_transform_data { - struct ppcg_options *options; - __isl_give isl_printer *(*transform)(__isl_take isl_printer *p, - struct ppcg_scop *scop, void *user); - void *user; -}; - -/* Should we print the original code? - * That is, does "scop" involve any data dependent conditions or - * nested expressions that cannot be handled by pet_stmt_build_ast_exprs? - */ -static int print_original(struct pet_scop *scop, struct ppcg_options *options) -{ - if (!pet_scop_can_build_ast_exprs(scop)) { - if (options->debug->verbose) - fprintf(stdout, "Printing original code because " - "some index expressions cannot currently " - "be printed\n"); - return 1; - } - - if (pet_scop_has_data_dependent_conditions(scop)) { - if (options->debug->verbose) - fprintf(stdout, "Printing original code because " - "input involves data dependent conditions\n"); - return 1; - } - - return 0; -} - -/* Callback for pet_transform_C_source that transforms - * the given pet_scop to a ppcg_scop before calling the - * ppcg_transform callback. - * - * If "scop" contains any data dependent conditions or if we may - * not be able to print the transformed program, then just print - * the original code. - */ -static __isl_give isl_printer *transform(__isl_take isl_printer *p, - struct pet_scop *scop, void *user) -{ - struct ppcg_transform_data *data = user; - struct ppcg_scop *ps; - - if (print_original(scop, data->options)) { - p = pet_scop_print_original(scop, p); - pet_scop_free(scop); - return p; - } - - scop = pet_scop_align_params(scop); - ps = ppcg_scop_from_pet_scop(scop, data->options); - - p = data->transform(p, ps, data->user); - - ppcg_scop_free(ps); - pet_scop_free(scop); - - return p; -} - -/* Transform the C source file "input" by rewriting each scop - * through a call to "transform". - * The transformed C code is written to "out". - * - * This is a wrapper around pet_transform_C_source that transforms - * the pet_scop to a ppcg_scop before calling "fn". - */ -int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out, - struct ppcg_options *options, - __isl_give isl_printer *(*fn)(__isl_take isl_printer *p, - struct ppcg_scop *scop, void *user), void *user) -{ - struct ppcg_transform_data data = { options, fn, user }; - return pet_transform_C_source(ctx, input, out, &transform, &data); -} - -/* Check consistency of options. - * - * Return -1 on error. 
- */
-static int check_options(isl_ctx *ctx)
-{
- struct options *options;
-
- options = isl_ctx_peek_options(ctx, &options_args);
- if (!options)
- isl_die(ctx, isl_error_internal,
- "unable to find options", return -1);
-
- if (options->ppcg->openmp &&
- !isl_options_get_ast_build_atomic_upper_bound(ctx))
- isl_die(ctx, isl_error_invalid,
- "OpenMP requires atomic bounds", return -1);
-
- return 0;
-}
-
-#if 0
-int main(int argc, char **argv)
-{
- int r;
- isl_ctx *ctx;
- struct options *options;
-
- options = options_new_with_defaults();
- assert(options);
-
- ctx = isl_ctx_alloc_with_options(&options_args, options);
- ppcg_options_set_target_defaults(options->ppcg);
- isl_options_set_ast_build_detect_min_max(ctx, 1);
- isl_options_set_ast_print_macro_once(ctx, 1);
- isl_options_set_schedule_whole_component(ctx, 0);
- isl_options_set_schedule_maximize_band_depth(ctx, 1);
- isl_options_set_schedule_maximize_coincidence(ctx, 1);
- pet_options_set_encapsulate_dynamic_control(ctx, 1);
- argc = options_parse(options, argc, argv, ISL_ARG_ALL);
-
- if (check_options(ctx) < 0)
- r = EXIT_FAILURE;
- else if (options->ppcg->target == PPCG_TARGET_CUDA)
- r = generate_cuda(ctx, options->ppcg, options->input);
- else if (options->ppcg->target == PPCG_TARGET_OPENCL)
- r = generate_opencl(ctx, options->ppcg, options->input,
- options->output);
- else
- r = generate_cpu(ctx, options->ppcg, options->input,
- options->output);
-
- isl_ctx_free(ctx);
-
- return r;
-}
-#endif
diff --git a/polly/lib/External/ppcg/ppcg_options.h b/polly/lib/External/ppcg/ppcg_options.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/ppcg_options.h
+++ /dev/null
@@ -1,100 +0,0 @@
-#ifndef PPCG_OPTIONS_H
-#define PPCG_OPTIONS_H
-
-#include <isl/arg.h>
-#include <isl/options.h>
-
-struct ppcg_debug_options {
- int dump_schedule_constraints;
- int dump_schedule;
- int dump_final_schedule;
- int dump_sizes;
- int verbose;
-};
-
-struct ppcg_options {
- struct isl_options *isl;
- struct ppcg_debug_options *debug;
-
- /* Group chains of consecutive statements before scheduling. */
- int group_chains;
-
- /* Use isl to compute a schedule replacing the original schedule. */
- int reschedule;
- int scale_tile_loops;
- int wrap;
-
- /* Assume all parameters are non-negative. */
- int non_negative_parameters;
- char *ctx;
- char *sizes;
-
- /* Perform tiling (C target). */
- int tile;
- int tile_size;
-
- /* Isolate full tiles from partial tiles. */
- int isolate_full_tiles;
-
- /* Take advantage of private memory. */
- int use_private_memory;
-
- /* Take advantage of shared memory. */
- int use_shared_memory;
-
- /* Maximal amount of shared memory. */
- int max_shared_memory;
-
- /* The target we generate code for. */
- int target;
-
- /* Generate OpenMP macros (C target only). */
- int openmp;
-
- /* Linearize all device arrays. */
- int linearize_device_arrays;
-
- /* Allow the use of GNU extensions in generated code. */
- int allow_gnu_extensions;
-
- /* Allow live range to be reordered. */
- int live_range_reordering;
-
- /* Allow hybrid tiling whenever a suitable input pattern is found. */
- int hybrid;
-
- /* Unroll the code for copying to/from shared memory. */
- int unroll_copy_shared;
- /* Unroll code inside tile on GPU targets. */
- int unroll_gpu_tile;
-
- /* Options to pass to the OpenCL compiler. */
- char *opencl_compiler_options;
- /* Prefer GPU device over CPU. */
- int opencl_use_gpu;
- /* Number of files to include. */
- int opencl_n_include_file;
- /* Files to include.
*/ - const char **opencl_include_files; - /* Print definitions of types in kernels. */ - int opencl_print_kernel_types; - /* Embed OpenCL kernel code in host code. */ - int opencl_embed_kernel_code; - - /* Name of file for saving isl computed schedule or NULL. */ - char *save_schedule_file; - /* Name of file for loading schedule or NULL. */ - char *load_schedule_file; -}; - -ISL_ARG_DECL(ppcg_debug_options, struct ppcg_debug_options, - ppcg_debug_options_args) -ISL_ARG_DECL(ppcg_options, struct ppcg_options, ppcg_options_args) - -#define PPCG_TARGET_C 0 -#define PPCG_TARGET_CUDA 1 -#define PPCG_TARGET_OPENCL 2 - -void ppcg_options_set_target_defaults(struct ppcg_options *options); - -#endif diff --git a/polly/lib/External/ppcg/ppcg_options.c b/polly/lib/External/ppcg/ppcg_options.c deleted file mode 100644 --- a/polly/lib/External/ppcg/ppcg_options.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2010-2011 INRIA Saclay - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, - * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, - * 91893 Orsay, France - */ - -#include "ppcg_options.h" - -static struct isl_arg_choice target[] = { - {"c", PPCG_TARGET_C}, - {"cuda", PPCG_TARGET_CUDA}, - {"opencl", PPCG_TARGET_OPENCL}, - {0} -}; - -/* Set defaults that depend on the target. - * In particular, set --schedule-outer-coincidence iff target is a GPU. - */ -void ppcg_options_set_target_defaults(struct ppcg_options *options) -{ - char *argv[2] = { NULL }; - - argv[0] = "ppcg_options_set_target_defaults"; - if (options->target == PPCG_TARGET_C) - argv[1] = "--no-schedule-outer-coincidence"; - else - argv[1] = "--schedule-outer-coincidence"; - - isl_options_parse(options->isl, 2, argv, ISL_ARG_ALL); -} - -/* Callback that is called whenever the "target" option is set (to "val"). - * The callback is called after target has been updated. - * - * Call ppcg_options_set_target_defaults to reset the target-dependent options. 
- */
-static int set_target(void *opt, unsigned val)
-{
- struct ppcg_options *options = opt;
-
- ppcg_options_set_target_defaults(options);
-
- return 0;
-}
-
-ISL_ARGS_START(struct ppcg_debug_options, ppcg_debug_options_args)
-ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule_constraints, 0,
- "dump-schedule-constraints", 0, "dump schedule constraints")
-ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule, 0,
- "dump-schedule", 0, "dump isl computed schedule")
-ISL_ARG_BOOL(struct ppcg_debug_options, dump_final_schedule, 0,
- "dump-final-schedule", 0, "dump PPCG computed schedule")
-ISL_ARG_BOOL(struct ppcg_debug_options, dump_sizes, 0,
- "dump-sizes", 0,
- "dump effectively used per kernel tile, grid and block sizes")
-ISL_ARG_BOOL(struct ppcg_debug_options, verbose, 'v', "verbose", 0, NULL)
-ISL_ARGS_END
-
-ISL_ARGS_START(struct ppcg_options, ppcg_opencl_options_args)
-ISL_ARG_STR(struct ppcg_options, opencl_compiler_options, 0, "compiler-options",
- "options", NULL, "options to pass to the OpenCL compiler")
-ISL_ARG_BOOL(struct ppcg_options, opencl_use_gpu, 0, "use-gpu", 1,
- "use GPU device (if available)")
-ISL_ARG_STR_LIST(struct ppcg_options, opencl_n_include_file,
- opencl_include_files, 0, "include-file", "filename",
- "file to #include in generated OpenCL code")
-ISL_ARG_BOOL(struct ppcg_options, opencl_print_kernel_types, 0,
- "print-kernel-types", 1,
- "print definitions of types in the kernel file")
-ISL_ARG_BOOL(struct ppcg_options, opencl_embed_kernel_code, 0,
- "embed-kernel-code", 0, "embed kernel code into host code")
-ISL_ARGS_END
-
-ISL_ARGS_START(struct ppcg_options, ppcg_options_args)
-ISL_ARG_CHILD(struct ppcg_options, isl, "isl", &isl_options_args, "isl options")
-ISL_ARG_CHILD(struct ppcg_options, debug, NULL, &ppcg_debug_options_args,
- "debugging options")
-ISL_ARG_BOOL(struct ppcg_options, group_chains, 0, "group-chains", 1,
- "group chains of interdependent statements that are executed "
- "consecutively in the original schedule before scheduling")
-ISL_ARG_BOOL(struct ppcg_options, reschedule, 0, "reschedule", 1,
- "replace original schedule by isl computed schedule")
-ISL_ARG_BOOL(struct ppcg_options, scale_tile_loops, 0,
- "scale-tile-loops", 1, NULL)
-ISL_ARG_BOOL(struct ppcg_options, wrap, 0, "wrap", 1, NULL)
-ISL_ARG_BOOL(struct ppcg_options, use_shared_memory, 0, "shared-memory", 1,
- "use shared memory in kernel code")
-ISL_ARG_BOOL(struct ppcg_options, use_private_memory, 0, "private-memory", 1,
- "use private memory in kernel code")
-ISL_ARG_STR(struct ppcg_options, ctx, 0, "ctx", "context", NULL,
- "Constraints on parameters")
-ISL_ARG_BOOL(struct ppcg_options, non_negative_parameters, 0,
- "assume-non-negative-parameters", 0,
- "assume all parameters are non-negative")
-ISL_ARG_BOOL(struct ppcg_options, tile, 0, "tile", 0,
- "perform tiling (C target)")
-ISL_ARG_INT(struct ppcg_options, tile_size, 'S', "tile-size", "size", 32, NULL)
-ISL_ARG_BOOL(struct ppcg_options, isolate_full_tiles, 0, "isolate-full-tiles",
- 0, "isolate full tiles from partial tiles (hybrid tiling)")
-ISL_ARG_STR(struct ppcg_options, sizes, 0, "sizes", "sizes", NULL,
- "Per kernel tile, grid and block sizes")
-ISL_ARG_INT(struct ppcg_options, max_shared_memory, 0,
- "max-shared-memory", "size", 8192, "maximal amount of shared memory")
-ISL_ARG_BOOL(struct ppcg_options, openmp, 0, "openmp", 0,
- "Generate OpenMP macros (only for C target)")
-ISL_ARG_USER_OPT_CHOICE(struct ppcg_options, target, 0, "target", target,
- &set_target, PPCG_TARGET_CUDA, PPCG_TARGET_CUDA,
- "the target to generate code for")
-ISL_ARG_BOOL(struct ppcg_options, linearize_device_arrays, 0,
- "linearize-device-arrays", 1,
- "linearize all device arrays, even those of fixed size")
-ISL_ARG_BOOL(struct ppcg_options, allow_gnu_extensions, 0,
- "allow-gnu-extensions", 1,
- "allow the use of GNU extensions in generated code")
-ISL_ARG_BOOL(struct ppcg_options, live_range_reordering, 0,
- "live-range-reordering", 1,
- "allow successive live ranges on the same memory element "
- "to be reordered")
-ISL_ARG_BOOL(struct ppcg_options, hybrid, 0, "hybrid", 0,
- "apply hybrid tiling whenever a suitable input pattern is found "
- "(GPU targets)")
-ISL_ARG_BOOL(struct ppcg_options, unroll_copy_shared, 0, "unroll-copy-shared",
- 0, "unroll code for copying to/from shared memory")
-ISL_ARG_BOOL(struct ppcg_options, unroll_gpu_tile, 0, "unroll-gpu-tile", 0,
- "unroll code inside tile on GPU targets")
-ISL_ARG_GROUP("opencl", &ppcg_opencl_options_args, "OpenCL options")
-ISL_ARG_STR(struct ppcg_options, save_schedule_file, 0, "save-schedule",
- "file", NULL, "save isl computed schedule to <file>")
-ISL_ARG_STR(struct ppcg_options, load_schedule_file, 0, "load-schedule",
- "file", NULL, "load schedule from <file>, "
- "using it instead of an isl computed schedule")
-ISL_ARGS_END
diff --git a/polly/lib/External/ppcg/print.h b/polly/lib/External/ppcg/print.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/print.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef PRINT_H
-#define PRINT_H
-
-#include <isl/ast.h>
-
-#include "ppcg.h"
-
-extern const char *ppcg_min;
-extern const char *ppcg_max;
-extern const char *ppcg_fdiv_q;
-
-__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p);
-__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p);
-
-__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p);
-__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
- const char *min, const char *max);
-__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
- __isl_take isl_printer *p);
-__isl_give isl_printer *ppcg_ast_expr_print_macros(
- __isl_keep isl_ast_expr *expr, __isl_take isl_printer *p);
-__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
- __isl_keep isl_id_to_ast_expr *ref2expr);
-__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
- __isl_keep isl_ast_node *node);
-
-__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
- __isl_keep isl_ast_build *build);
-
-__isl_give isl_printer *ppcg_print_declaration_with_size(
- __isl_take isl_printer *p, const char *base_type,
- __isl_keep isl_ast_expr *size);
-__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
- struct pet_array *array, __isl_keep isl_ast_build *build);
-__isl_give isl_printer *ppcg_print_exposed_declarations(
- __isl_take isl_printer *p, struct ppcg_scop *scop);
-__isl_give isl_printer *ppcg_print_hidden_declarations(
- __isl_take isl_printer *p, struct ppcg_scop *scop);
-
-#endif
diff --git a/polly/lib/External/ppcg/print.c b/polly/lib/External/ppcg/print.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/print.c
+++ /dev/null
@@ -1,461 +0,0 @@
-/*
- * Copyright 2012-2013 Ecole Normale Superieure
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
- */
-
-#include
-#include
-#include
-
-#include "print.h"
-#include "util.h"
-
-__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p)
-{
- p =
isl_printer_start_line(p); - p = isl_printer_print_str(p, "{"); - p = isl_printer_end_line(p); - p = isl_printer_indent(p, 2); - return p; -} - -__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p) -{ - p = isl_printer_indent(p, -2); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "}"); - p = isl_printer_end_line(p); - return p; -} - -/* Names of notes that keep track of whether min/max - * macro definitions have already been printed. - */ -static const char *ppcg_max_printed = "ppcg_max_printed"; -static const char *ppcg_min_printed = "ppcg_min_printed"; - -/* Has the macro definition corresponding to "note_name" been printed - * to "p" before? - * That is, does "p" have an associated "note_name" note? - */ -static isl_bool printed_before(__isl_keep isl_printer *p, const char *note_name) -{ - isl_ctx *ctx; - isl_id *id; - isl_bool printed; - - if (!p) - return isl_bool_error; - - ctx = isl_printer_get_ctx(p); - id = isl_id_alloc(ctx, note_name, NULL); - printed = isl_printer_has_note(p, id); - isl_id_free(id); - - return printed; -} - -/* Keep track of the fact that the macro definition corresponding - * to "note_name" has been printed to "p" by attaching a note with - * that name. The value of the note is of no importance, but it - * has to be a valid isl_id, so the note identifier is reused - * as the note. - */ -static __isl_give isl_printer *mark_printed(__isl_take isl_printer *p, - const char *note_name) -{ - isl_ctx *ctx; - isl_id *id; - - if (!p) - return NULL; - - ctx = isl_printer_get_ctx(p); - id = isl_id_alloc(ctx, note_name, NULL); - return isl_printer_set_note(p, id, isl_id_copy(id)); -} - -/* Print a macro definition "def" for the macro "name" to "p", - * unless such a macro definition has been printed to "p" before. - * "note_name" is used as the name of the note that keeps track - * of whether this printing has happened. - */ -static __isl_give isl_printer *print_ppcg_macro(__isl_take isl_printer *p, - const char *name, const char *def, const char *note_name) -{ - isl_bool printed; - - printed = printed_before(p, note_name); - if (printed < 0) - return isl_printer_free(p); - if (printed) - return p; - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "#define "); - p = isl_printer_print_str(p, name); - p = isl_printer_print_str(p, def); - p = isl_printer_end_line(p); - - p = mark_printed(p, note_name); - - return p; -} - -/* Structure for keeping track of definitions of some macros. - */ -struct ppcg_macros { - const char *min; - const char *max; -}; - -/* Free the memory allocated by a struct ppcg_macros. - */ -static void ppcg_macros_free(void *user) -{ - free(user); -} - -/* Default macro definitions (when GNU extensions are allowed). - */ -struct ppcg_macros ppcg_macros_default = { - .min = "(x,y) " - "({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); " - "_x < _y ? _x : _y; })", - .max = "(x,y) " - "({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); " - "_x > _y ? _x : _y; })", -}; - -/* Name used for the note that keeps track of macro definitions. - */ -static const char *ppcg_macros = "ppcg_macros"; - -/* Set the macro definitions for isl_ast_op_min and isl_ast_op_max - * to "min" and "max" and store them in "p". - * - * In particular, create a ppcg_macros object and attach it - * as a note to the printer. 
- */ -__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p, - const char *min, const char *max) -{ - isl_ctx *ctx; - isl_id *id, *macros_id; - struct ppcg_macros *macros; - - if (!p) - return NULL; - - ctx = isl_printer_get_ctx(p); - macros = isl_alloc_type(ctx, struct ppcg_macros); - if (!macros) - return isl_printer_free(p); - macros->min = min; - macros->max = max; - id = isl_id_alloc(ctx, ppcg_macros, NULL); - macros_id = isl_id_alloc(ctx, NULL, macros); - if (!macros_id) - ppcg_macros_free(macros); - else - macros_id = isl_id_set_free_user(macros_id, &ppcg_macros_free); - - p = isl_printer_set_note(p, id, macros_id); - - return p; -} - -/* Return the ppcg_macros object that holds the currently active - * macro definitions in "p". - * If "p" has a note with macro definitions, then return those. - * Otherwise, return the default macro definitions. - */ -static struct ppcg_macros *get_macros(__isl_keep isl_printer *p) -{ - isl_id *id; - isl_bool has_macros; - struct ppcg_macros *macros; - - id = isl_id_alloc(isl_printer_get_ctx(p), ppcg_macros, NULL); - has_macros = isl_printer_has_note(p, id); - if (has_macros < 0 || !has_macros) { - isl_id_free(id); - if (has_macros < 0) - return NULL; - return &ppcg_macros_default; - } - id = isl_printer_get_note(p, id); - macros = isl_id_get_user(id); - isl_id_free(id); - - return macros; -} - -/* Print the currently active macro definition for ppcg_max. - */ -static __isl_give isl_printer *print_max(__isl_take isl_printer *p) -{ - struct ppcg_macros *macros; - - macros = get_macros(p); - if (!macros) - return isl_printer_free(p); - return print_ppcg_macro(p, ppcg_max, macros->max, ppcg_max_printed); -} - -/* Print the currently active macro definition for ppcg_min. - */ -static __isl_give isl_printer *print_min(__isl_take isl_printer *p) -{ - struct ppcg_macros *macros; - - macros = get_macros(p); - if (!macros) - return isl_printer_free(p); - return print_ppcg_macro(p, ppcg_min, macros->min, ppcg_min_printed); -} - -/* Print a macro definition for "type" to "p". - * If GNU extensions are allowed, then print a specialized definition - * for isl_ast_op_min and isl_ast_op_max. - * Otherwise, use the default isl definition. - */ -__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type, - __isl_take isl_printer *p) -{ - isl_ctx *ctx; - struct ppcg_options *options; - - if (!p) - return NULL; - - ctx = isl_printer_get_ctx(p); - options = isl_ctx_peek_options(ctx, &ppcg_options_args); - if (!options || !options->allow_gnu_extensions) - return isl_ast_op_type_print_macro(type, p); - - switch (type) { - case isl_ast_op_max: - return print_max(p); - case isl_ast_op_min: - return print_min(p); - default: - return isl_ast_op_type_print_macro(type, p); - } -} - -/* isl_ast_expr_foreach_ast_op_type or isl_ast_node_foreach_ast_op_type - * callback that prints a macro definition for "type". - */ -static isl_stat print_macro(enum isl_ast_op_type type, void *user) -{ - isl_printer **p = user; - - *p = ppcg_print_macro(type, *p); - if (!*p) - return isl_stat_error; - - return isl_stat_ok; -} - -/* Print the required macros for "expr". - */ -__isl_give isl_printer *ppcg_ast_expr_print_macros( - __isl_keep isl_ast_expr *expr, __isl_take isl_printer *p) -{ - if (isl_ast_expr_foreach_ast_op_type(expr, &print_macro, &p) < 0) - return isl_printer_free(p); - return p; -} - -/* isl_id_to_ast_expr_foreach callback that prints the required - * macro definitions for "val". 
- */ -static isl_stat print_expr_macros(__isl_take isl_id *key, - __isl_take isl_ast_expr *val, void *user) -{ - isl_printer **p = user; - - *p = ppcg_ast_expr_print_macros(val, *p); - isl_id_free(key); - isl_ast_expr_free(val); - - if (!*p) - return isl_stat_error; - return isl_stat_ok; -} - -/* Print the required macro definitions for the body of a statement in which - * the access expressions are replaced by the isl_ast_expr objects - * in "ref2expr". - */ -__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p, - __isl_keep isl_id_to_ast_expr *ref2expr) -{ - if (isl_id_to_ast_expr_foreach(ref2expr, &print_expr_macros, &p) < 0) - return isl_printer_free(p); - return p; -} - -/* Print the required macros for "node". - */ -__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p, - __isl_keep isl_ast_node *node) -{ - if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0) - return isl_printer_free(p); - return p; -} - -/* Names used for the macros that may appear in a printed isl AST. - */ -const char *ppcg_min = "ppcg_min"; -const char *ppcg_max = "ppcg_max"; -const char *ppcg_fdiv_q = "ppcg_fdiv_q"; - -/* Set the names of the macros that may appear in a printed isl AST. - */ -__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p) -{ - p = isl_ast_op_type_set_print_name(p, isl_ast_op_min, ppcg_min); - p = isl_ast_op_type_set_print_name(p, isl_ast_op_max, ppcg_max); - p = isl_ast_op_type_set_print_name(p, isl_ast_op_fdiv_q, ppcg_fdiv_q); - - return p; -} - -/* Given a multi affine expression "mpa" without domain, modify it to have - * the schedule space of "build" as domain. - * - * If the schedule space of "build" is a parameter space, then nothing - * needs to be done. - * Otherwise, "mpa" is first given a 0D domain and then it is combined - * with a mapping from the schedule space of "build" to the same 0D domain. - */ -__isl_give isl_multi_pw_aff *ppcg_attach_multi_pw_aff( - __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_ast_build *build) -{ - isl_bool params; - isl_space *space; - isl_multi_aff *ma; - - space = isl_ast_build_get_schedule_space(build); - params = isl_space_is_params(space); - if (params < 0 || params) { - isl_space_free(space); - if (params < 0) - return isl_multi_pw_aff_free(mpa); - return mpa; - } - space = isl_space_from_domain(space); - ma = isl_multi_aff_zero(space); - mpa = isl_multi_pw_aff_from_range(mpa); - mpa = isl_multi_pw_aff_pullback_multi_aff(mpa, ma); - - return mpa; -} - -/* Build an access AST expression from "size" using "build". - * "size" does not have a domain, but "build" may have a proper schedule space. - * First modify "size" to have that schedule space as domain. - */ -__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size, - __isl_keep isl_ast_build *build) -{ - size = ppcg_attach_multi_pw_aff(size, build); - return isl_ast_build_access_from_multi_pw_aff(build, size); -} - -/* Print a declaration for an array with element type "base_type" and - * size "size" to "p". 
- */ -__isl_give isl_printer *ppcg_print_declaration_with_size( - __isl_take isl_printer *p, const char *base_type, - __isl_keep isl_ast_expr *size) -{ - if (!base_type || !size) - return isl_printer_free(p); - - p = ppcg_ast_expr_print_macros(size, p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, base_type); - p = isl_printer_print_str(p, " "); - p = isl_printer_print_ast_expr(p, size); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); - - return p; -} - -/* Print a declaration for array "array" to "p", using "build" - * to simplify any size expressions. - * - * The size is computed from the extent of the array and is - * subsequently converted to an "access expression" by "build". - */ -__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p, - struct pet_array *array, __isl_keep isl_ast_build *build) -{ - isl_multi_pw_aff *size; - isl_ast_expr *expr; - - if (!array) - return isl_printer_free(p); - - size = ppcg_size_from_extent(isl_set_copy(array->extent)); - expr = isl_ast_build_access_from_multi_pw_aff(build, size); - p = ppcg_print_declaration_with_size(p, array->element_type, expr); - isl_ast_expr_free(expr); - - return p; -} - -/* Print declarations for the arrays in "scop" that are declared - * and that are exposed (if exposed == 1) or not exposed (if exposed == 0). - */ -static __isl_give isl_printer *print_declarations(__isl_take isl_printer *p, - struct ppcg_scop *scop, int exposed) -{ - int i; - isl_ast_build *build; - - if (!scop) - return isl_printer_free(p); - - build = isl_ast_build_from_context(isl_set_copy(scop->context)); - for (i = 0; i < scop->pet->n_array; ++i) { - struct pet_array *array = scop->pet->arrays[i]; - - if (!array->declared) - continue; - if (array->exposed != exposed) - continue; - - p = ppcg_print_declaration(p, array, build); - } - isl_ast_build_free(build); - - return p; -} - -/* Print declarations for the arrays in "scop" that are declared - * and exposed to the code after the scop. - */ -__isl_give isl_printer *ppcg_print_exposed_declarations( - __isl_take isl_printer *p, struct ppcg_scop *scop) -{ - return print_declarations(p, scop, 1); -} - -/* Print declarations for the arrays in "scop" that are declared, - * but not exposed to the code after the scop. 
- */ -__isl_give isl_printer *ppcg_print_hidden_declarations( - __isl_take isl_printer *p, struct ppcg_scop *scop) -{ - return print_declarations(p, scop, 0); -} diff --git a/polly/lib/External/ppcg/schedule.h b/polly/lib/External/ppcg/schedule.h deleted file mode 100644 --- a/polly/lib/External/ppcg/schedule.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _SCHEDULE_H -#define _SCHEDULE_H - -#include -#include -#include -#include - -#include "ppcg_options.h" - -__isl_give isl_set *parametrization(__isl_take isl_space *space, - int len, int first, __isl_keep isl_id_list *names); - -__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx, - struct ppcg_options *options, - __isl_give isl_schedule *(*compute)(void *user), void *user); - -__isl_give isl_schedule_node *ppcg_set_schedule_node_type( - __isl_take isl_schedule_node *node, enum isl_ast_loop_type type); - -#endif diff --git a/polly/lib/External/ppcg/schedule.c b/polly/lib/External/ppcg/schedule.c deleted file mode 100644 --- a/polly/lib/External/ppcg/schedule.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright 2010-2011 INRIA Saclay - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, - * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, - * 91893 Orsay, France - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "schedule.h" - -/* Add parameters with identifiers "ids" to "set". - */ -static __isl_give isl_set *add_params(__isl_take isl_set *set, - __isl_keep isl_id_list *ids) -{ - int i, n; - unsigned nparam; - - n = isl_id_list_n_id(ids); - - nparam = isl_set_dim(set, isl_dim_param); - set = isl_set_add_dims(set, isl_dim_param, n); - - for (i = 0; i < n; ++i) { - isl_id *id; - - id = isl_id_list_get_id(ids, i); - set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id); - } - - return set; -} - -/* Equate the dimensions of "set" starting at "first" to - * freshly created parameters with identifiers "ids". - * The number of equated dimensions is equal to the number of elements in "ids". - */ -static __isl_give isl_set *parametrize(__isl_take isl_set *set, - int first, __isl_keep isl_id_list *ids) -{ - int i, n; - unsigned nparam; - - nparam = isl_set_dim(set, isl_dim_param); - - set = add_params(set, ids); - - n = isl_id_list_n_id(ids); - for (i = 0; i < n; ++i) - set = isl_set_equate(set, isl_dim_param, nparam + i, - isl_dim_set, first + i); - - return set; -} - -/* Given a parameter space "space", create a set of dimension "len" - * of which the dimensions starting at "first" are equated to - * freshly created parameters with identifiers "ids". - */ -__isl_give isl_set *parametrization(__isl_take isl_space *space, - int len, int first, __isl_keep isl_id_list *ids) -{ - isl_set *set; - - space = isl_space_set_from_params(space); - space = isl_space_add_dims(space, isl_dim_set, len); - set = isl_set_universe(space); - - return parametrize(set, first, ids); -} - -/* Load and return a schedule from a file called "filename". - */ -static __isl_give isl_schedule *load_schedule(isl_ctx *ctx, - const char *filename) -{ - FILE *file; - isl_schedule *schedule; - - file = fopen(filename, "r"); - if (!file) { - fprintf(stderr, "Unable to open '%s' for reading\n", filename); - return NULL; - } - schedule = isl_schedule_read_from_file(ctx, file); - fclose(file); - - return schedule; -} - -/* Save the schedule "schedule" to a file called "filename". - * The schedule is printed in block style. 
- */
-static void save_schedule(__isl_keep isl_schedule *schedule,
- const char *filename)
-{
- FILE *file;
- isl_ctx *ctx;
- isl_printer *p;
-
- if (!schedule)
- return;
-
- file = fopen(filename, "w");
- if (!file) {
- fprintf(stderr, "Unable to open '%s' for writing\n", filename);
- return;
- }
- ctx = isl_schedule_get_ctx(schedule);
- p = isl_printer_to_file(ctx, file);
- p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
- p = isl_printer_print_schedule(p, schedule);
- isl_printer_free(p);
- fclose(file);
-}
-
-/* Obtain a schedule, either by reading it from a file
- * or by computing it using "compute".
- * Also take care of saving the computed schedule and/or
- * dumping the obtained schedule if requested by the user.
- */
-__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
- struct ppcg_options *options,
- __isl_give isl_schedule *(*compute)(void *user), void *user)
-{
- isl_schedule *schedule;
-
- if (options->load_schedule_file) {
- schedule = load_schedule(ctx, options->load_schedule_file);
- } else {
- schedule = compute(user);
- if (options->save_schedule_file)
- save_schedule(schedule, options->save_schedule_file);
- }
- if (options->debug->dump_schedule)
- isl_schedule_dump(schedule);
-
- return schedule;
-}
-
-/* Mark all dimensions in the band node "node" to be of "type".
- */
-__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
- __isl_take isl_schedule_node *node, enum isl_ast_loop_type type)
-{
- int i, n;
-
- n = isl_schedule_node_band_n_member(node);
- for (i = 0; i < n; ++i)
- node = isl_schedule_node_band_member_set_ast_loop_type(node, i,
- type);
-
- return node;
-}
diff --git a/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c b/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
- int A[2][1000][1000];
- int B[2][1000][1000];
-
-#pragma scop
- {
- for (int i = 0; i < 256; ++i)
- for (int j = 0; j < 256; ++j)
- if (j % 8 <= 2 || j % 8 >= 6)
- A[1][i][j] = B[1][j][i];
- }
-#pragma endscop
-
-/*
-
-When compiled with:
-
-./ppcg tests/allow-sparse-copy-in.c --no-linearize-device-arrays
- --on-error=abort --sizes='{kernel[i]->tile[8,8]; kernel[i]->block[1,8]}'
- --max-shared-memory=-1 --unroll-copy-shared
-
-this originally resulted in the following copy-in code:
-
- shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1];
- shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1];
- shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1];
- shared_B[0][3][t1] = B[1][8 * b1 + 3][8 * b0 + t1];
- shared_B[0][4][t1] = B[1][8 * b1 + 4][8 * b0 + t1];
- shared_B[0][5][t1] = B[1][8 * b1 + 5][8 * b0 + t1];
- shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1];
- shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1];
-
-whereas we want to perform only those copies that are actually needed:
-
- shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1];
- shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1];
- shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1];
- shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1];
- shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1];
-*/
- for (int i = 0; i < 100; ++i)
- if (A[1][0][i] != i)
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/call.c b/polly/lib/External/ppcg/tests/call.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/call.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <stdlib.h>
-
-void copy_summary(int b[1000], int a[1000], int pos)
-{
- b[pos] = 0;
- int c =
 a[pos];
-}
-
-#ifdef pencil_access
-__attribute__((pencil_access(copy_summary)))
-#endif
-void copy(int b[1000], int a[1000], int pos);
-
-int main()
-{
- int a[1000], b[1000];
-
- for (int i = 0; i < 1000; ++i)
- a[i] = i;
-#pragma scop
- for (int i = 0; i < 1000; ++i)
- copy(b, a, i);
-#pragma endscop
- for (int i = 0; i < 1000; ++i)
- if (b[i] != a[i])
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/call2.c b/polly/lib/External/ppcg/tests/call2.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/call2.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <stdlib.h>
-
-void copy_summary(int b[1000], int a[1000], int pos)
-{
- b[pos] = 0;
- int c = a[pos];
-}
-
-#ifdef pencil_access
-__attribute__((pencil_access(copy_summary)))
-#endif
-void copy(int b[1000], int a[1000], int pos);
-
-int main()
-{
- int a[2][1000];
-
- for (int i = 0; i < 1000; ++i)
- a[0][i] = i;
-#pragma scop
- for (int i = 0; i < 1000; ++i)
- copy(a[1], a[0], i);
-#pragma endscop
- for (int i = 0; i < 1000; ++i)
- if (a[1][i] != a[0][i])
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/call2_opencl_functions.cl b/polly/lib/External/ppcg/tests/call2_opencl_functions.cl
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/call2_opencl_functions.cl
+++ /dev/null
@@ -1,4 +0,0 @@
-void copy(__global int b[1000], __global int a[1000], int pos)
-{
- b[pos] = a[pos];
-}
diff --git a/polly/lib/External/ppcg/tests/call3.c b/polly/lib/External/ppcg/tests/call3.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/call3.c
+++ /dev/null
@@ -1,32 +0,0 @@
-#include <stdlib.h>
-
-void copy_summary(int b[100], int a[100])
-{
- for (int i = 0; i < 100; ++i) {
- b[i] = 0;
- int c = a[i];
- }
-}
-
-#ifdef pencil_access
-__attribute__((pencil_access(copy_summary)))
-#endif
-void copy(int b[100], int a[100]);
-
-int main()
-{
- int A[100][100], B[100];
-
- for (int i = 0; i < 100; ++i)
- B[i] = i;
-#pragma scop
- for (int i = 0; i < 100; ++i)
- copy(A[i], B);
-#pragma endscop
- for (int i = 0; i < 100; ++i)
- for (int j = 0; j < 100; ++j)
- if (A[j][i] != B[i])
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/call3_opencl_functions.cl b/polly/lib/External/ppcg/tests/call3_opencl_functions.cl
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/call3_opencl_functions.cl
+++ /dev/null
@@ -1,5 +0,0 @@
-void copy(__global int b[100], __global int a[100])
-{
- for (int i = 0; i < 100; ++i)
- b[i] = a[i];
-}
diff --git a/polly/lib/External/ppcg/tests/call_opencl_functions.cl b/polly/lib/External/ppcg/tests/call_opencl_functions.cl
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/call_opencl_functions.cl
+++ /dev/null
@@ -1,4 +0,0 @@
-void copy(__global int b[1000], __global int a[1000], int pos)
-{
- b[pos] = a[pos];
-}
diff --git a/polly/lib/External/ppcg/tests/dead.c b/polly/lib/External/ppcg/tests/dead.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/dead.c
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
- int a[1000], b[1000];
-
- for (int i = 0; i < 1000; ++i)
- a[i] = i;
-#pragma scop
- for (int i = 0; i < 1000; ++i) {
- int c;
- int d;
- c = a[i];
- d = c;
- b[i] = c;
- }
-#pragma endscop
- for (int i = 0; i < 1000; ++i)
- if (b[i] != a[i])
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/iterator.c b/polly/lib/External/ppcg/tests/iterator.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/iterator.c
+++ /dev/null
@@ -1,18 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
- int i;
- int a[101];
-
- i = 0;
-#pragma scop
- for (i = 0; i < 100; ++i)
- a[i] = i;
- a[i] = i;
-#pragma endscop
- if (a[100] != 100)
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/live_out.c b/polly/lib/External/ppcg/tests/live_out.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/live_out.c
+++ /dev/null
@@ -1,22 +0,0 @@
-#include <stdlib.h>
-
-/* Check that a write access is not removed from the live-out
- * accesses only because a strict subset of the (potentially)
- * accessed elements are killed by a later write.
- */
-int main()
-{
- int A[10];
-
- A[1] = 0;
-#pragma scop
- int i = 1;
- i = i * i;
- A[i] = 1;
- A[0] = 0;
-#pragma endscop
- if (A[1] != 1)
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/local.c b/polly/lib/External/ppcg/tests/local.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/local.c
+++ /dev/null
@@ -1,22 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
- int A[100];
-
-#pragma scop
- {
- int B[100];
- B[0] = 0;
- for (int i = 1; i < 100; ++i)
- B[i] = B[i - 1] + 1;
- for (int i = 0; i < 100; ++i)
- A[i] = B[i];
- }
-#pragma endscop
- for (int i = 0; i < 100; ++i)
- if (A[i] != i)
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/loop.c b/polly/lib/External/ppcg/tests/loop.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/loop.c
+++ /dev/null
@@ -1,18 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
- int a[1000], b[1000];
-
- for (int i = 0; i < 1000; ++i)
- a[i] = i;
-#pragma scop
- for (int i = 0; i < 1000; ++i)
- b[i] = a[i];
-#pragma endscop
- for (int i = 0; i < 1000; ++i)
- if (b[i] != a[i])
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/not_accessed.c b/polly/lib/External/ppcg/tests/not_accessed.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/not_accessed.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <stdlib.h>
-
-void copy_summary(int b[1000], int a[1000], int pos, int c[1000])
-{
- b[pos] = 0;
- int d = a[pos];
-}
-
-#ifdef pencil_access
-__attribute__((pencil_access(copy_summary)))
-#endif
-void copy(int b[1000], int a[1000], int pos, int c[1000]);
-
-int main()
-{
- int a[1000], b[1000], c[1000];
-
- for (int i = 0; i < 1000; ++i)
- a[i] = i;
-#pragma scop
- for (int i = 0; i < 1000; ++i)
- copy(b, a, i, c);
-#pragma endscop
- for (int i = 0; i < 1000; ++i)
- if (b[i] != a[i])
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl b/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl
+++ /dev/null
@@ -1,5 +0,0 @@
-void copy(__global int b[1000], __global int a[1000], int pos,
- __global int c[1000])
-{
- b[pos] = a[pos];
-}
diff --git a/polly/lib/External/ppcg/tests/scalar.c b/polly/lib/External/ppcg/tests/scalar.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/scalar.c
+++ /dev/null
@@ -1,13 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
- int a;
-#pragma scop
- a = 1;
-#pragma endscop
- if (a != 1)
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/shared_sink.c b/polly/lib/External/ppcg/tests/shared_sink.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/shared_sink.c
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <stdlib.h>
-
-/* Check that the sources of live ranges with the same sink
- * are executed in order.
- */
-int main()
-{
- int A[128];
- int n = 128;
-
- A[0] = 0;
-#pragma scop
- for (int i = 0; i < n; ++i) {
- int set = 0;
- if (A[i] < 2)
- set = 1;
- if (set)
- A[i] = 2;
- }
-#pragma endscop
- if (A[0] != 2)
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/struct.c b/polly/lib/External/ppcg/tests/struct.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/struct.c
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <stdlib.h>
-
-struct s {
- int c[10][10];
-};
-
-int main()
-{
- struct s a[10][10], b[10][10];
-
- for (int i = 0; i < 10; ++i)
- for (int j = 0; j < 10; ++j)
- for (int k = 0; k < 10; ++k)
- for (int l = 0; l < 10; ++l)
- a[i][j].c[k][l] = i + j + k + l;
-#pragma scop
- for (int i = 0; i < 10; ++i)
- for (int j = 0; j < 10; ++j)
- for (int k = 0; k < 10; ++k)
- for (int l = 0; l < 10; ++l)
- b[i][j].c[k][l] = i + j + k + l;
-#pragma endscop
- for (int i = 0; i < 10; ++i)
- for (int j = 0; j < 10; ++j)
- for (int k = 0; k < 10; ++k)
- for (int l = 0; l < 10; ++l)
- if (b[i][j].c[k][l] != a[i][j].c[k][l])
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/struct2.c b/polly/lib/External/ppcg/tests/struct2.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/struct2.c
+++ /dev/null
@@ -1,21 +0,0 @@
-#include <stdlib.h>
-
-struct s {
- int a;
-};
-
-int main()
-{
- struct s a, b[10];
-
-#pragma scop
- a.a = 42;
- for (int i = 0; i < 10; ++i)
- b[i].a = a.a;
-#pragma endscop
- for (int i = 0; i < 10; ++i)
- if (b[i].a != 42)
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/struct3.c b/polly/lib/External/ppcg/tests/struct3.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/struct3.c
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <stdlib.h>
-
-struct s {
- int a;
- int b;
-};
-
-int main()
-{
- struct s a, b[10];
-
- a.b = 57;
-#pragma scop
- a.a = 42;
- for (int i = 0; i < 10; ++i)
- b[i] = a;
-#pragma endscop
- for (int i = 0; i < 10; ++i)
- if (b[i].a != 42)
- return EXIT_FAILURE;
- if (a.b != 57)
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/tests/struct4.c b/polly/lib/External/ppcg/tests/struct4.c
deleted file mode 100644
--- a/polly/lib/External/ppcg/tests/struct4.c
+++ /dev/null
@@ -1,27 +0,0 @@
-#include <stdlib.h>
-
-struct s {
- int a;
- int b;
-};
-
-int main()
-{
- int a[10];
-
- for (int i = 0; i < 10; ++i)
- a[i] = 0;
-#pragma scop
- for (int i = 0; i < 10; ++i) {
- struct s b;
- b.a = 1;
- b.b = i;
- a[i] = b.a + b.b;
- }
-#pragma endscop
- for (int i = 0; i < 10; ++i)
- if (a[i] != 1 + i)
- return EXIT_FAILURE;
-
- return EXIT_SUCCESS;
-}
diff --git a/polly/lib/External/ppcg/util.h b/polly/lib/External/ppcg/util.h
deleted file mode 100644
--- a/polly/lib/External/ppcg/util.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef UTIL_H
-#define UTIL_H
-
-#include <string.h>
-
-#include
-#include
-
-/* Compare the prefix of "s" to "prefix" up to the length of "prefix".
- */ -static inline int prefixcmp(const char *s, const char *prefix) -{ - return strncmp(s, prefix, strlen(prefix)); -} - -__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space, - int val); -__isl_give isl_multi_val *ppcg_multi_val_from_int_list( - __isl_take isl_space *space, int *list); -__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set); - -#endif diff --git a/polly/lib/External/ppcg/util.c b/polly/lib/External/ppcg/util.c deleted file mode 100644 --- a/polly/lib/External/ppcg/util.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2012-2013 Ecole Normale Superieure - * - * Use of this software is governed by the MIT license - * - * Written by Sven Verdoolaege, - * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France - */ - -#include -#include -#include -#include - -#include "util.h" - -/* Construct an isl_multi_val living in "space" with all values equal to "val". - */ -__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space, - int val) -{ - int i, n; - isl_ctx *ctx; - isl_val *v; - isl_multi_val *mv; - - if (!space) - return NULL; - - ctx = isl_space_get_ctx(space); - n = isl_space_dim(space, isl_dim_set); - mv = isl_multi_val_zero(space); - v = isl_val_int_from_si(ctx, val); - for (i = 0; i < n; ++i) - mv = isl_multi_val_set_val(mv, i, isl_val_copy(v)); - isl_val_free(v); - - return mv; -} - -/* Construct an isl_multi_val living in "space" with values specified - * by "list". "list" is assumed to have at least as many entries - * as the set dimension of "space". - */ -__isl_give isl_multi_val *ppcg_multi_val_from_int_list( - __isl_take isl_space *space, int *list) -{ - int i, n; - isl_ctx *ctx; - isl_multi_val *mv; - - if (!space) - return NULL; - - ctx = isl_space_get_ctx(space); - n = isl_space_dim(space, isl_dim_set); - mv = isl_multi_val_zero(space); - for (i = 0; i < n; ++i) { - isl_val *v; - - v = isl_val_int_from_si(ctx, list[i]); - mv = isl_multi_val_set_val(mv, i, v); - } - - return mv; -} - -/* Compute the size of a bounding box around the origin and "set", - * where "set" is assumed to contain only non-negative elements. - * In particular, compute the maximal value of "set" in each direction - * and add one. 
- */ -__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set) -{ - int i, n; - isl_multi_pw_aff *mpa; - - n = isl_set_dim(set, isl_dim_set); - mpa = isl_multi_pw_aff_zero(isl_set_get_space(set)); - for (i = 0; i < n; ++i) { - isl_space *space; - isl_aff *one; - isl_pw_aff *bound; - - if (!isl_set_dim_has_upper_bound(set, isl_dim_set, i)) { - const char *name; - name = isl_set_get_tuple_name(set); - if (!name) - name = ""; - fprintf(stderr, "unable to determine extent of '%s' " - "in dimension %d\n", name, i); - set = isl_set_free(set); - } - bound = isl_set_dim_max(isl_set_copy(set), i); - - space = isl_pw_aff_get_domain_space(bound); - one = isl_aff_zero_on_domain(isl_local_space_from_space(space)); - one = isl_aff_add_constant_si(one, 1); - bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one)); - mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound); - } - isl_set_free(set); - - return mpa; -} diff --git a/polly/lib/External/ppcg/version.c b/polly/lib/External/ppcg/version.c deleted file mode 100644 --- a/polly/lib/External/ppcg/version.c +++ /dev/null @@ -1,6 +0,0 @@ -#include "gitversion.h" - -const char *ppcg_version(void) -{ - return GIT_HEAD_ID"\n"; -} diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp --- a/polly/lib/Support/RegisterPasses.cpp +++ b/polly/lib/Support/RegisterPasses.cpp @@ -217,14 +217,6 @@ void initializePollyPasses(llvm::PassRegistry &Registry) { initializeCodeGenerationPass(Registry); -#ifdef GPU_CODEGEN - initializePPCGCodeGenerationPass(Registry); - initializeManagedMemoryRewritePassPass(Registry); - LLVMInitializeNVPTXTarget(); - LLVMInitializeNVPTXTargetInfo(); - LLVMInitializeNVPTXTargetMC(); - LLVMInitializeNVPTXAsmPrinter(); -#endif initializeCodePreparationPass(Registry); initializeDeadCodeElimWrapperPassPass(Registry); initializeDependenceInfoPass(Registry); diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -711,11 +711,6 @@ function_ref GetDeps, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, isl::schedule &LastSchedule, bool &DepsChanged) { - - // Skip SCoPs in case they're already optimised by PPCGCodeGeneration - if (S.isToBeSkipped()) - return; - // Skip empty SCoPs but still allow code generation as it will delete the // loops present but not needed. if (S.getSize() == 0) { diff --git a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll b/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll deleted file mode 100644 --- a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll +++ /dev/null @@ -1,9 +0,0 @@ -define float @__nv_expf(float %a) { - ret float %a -} -define float @__nv_cosf(float %a) { - ret float %a -} -define float @__nv_logf(float %a) { - ret float %a -} diff --git a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll b/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll deleted file mode 100644 --- a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Check that we detect a scop. 
-; SCOP: Function: checkScalarKill -; SCOP-NEXT: Region: %XLoopInit---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; Check that we have a scalar that is not a phi node in the scop. -; SCOP: i32 MemRef_x_0; // Element size 4 - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; Check that we add variables that are local to a scop into the kills that we -; pass to PPCG. This should enable PPCG to codegen this example. -; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) { -; int x; -; #pragma scop -; for(int i = 0; i < 1000; i++) { -; XLoopInit: x = 0; -; -; if (control1 > 2) -; C1Add: x += 10; -; if (control2 > 3) -; C2Add: x += A[i]; -; -; BLoopAccumX: B[i] += x; -; } -; -; #pragma endscop -; } -; ModuleID = 'test.ll' -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @checkScalarKill(ptr %A, ptr %B, ptr %C, i32 %control1, i32 %control2) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %XLoopInit - -XLoopInit: ; preds = %entry.split, %BLoopAccumX - %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ] - %cmp1 = icmp sgt i32 %control1, 2 - %x.0 = select i1 %cmp1, i32 10, i32 0 - %cmp2 = icmp sgt i32 %control2, 3 - br i1 %cmp2, label %C2Add, label %BLoopAccumX - -C2Add: ; preds = %XLoopInit - %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv - %tmp6 = load i32, ptr %arrayidx, align 4 - %add4 = add nsw i32 %tmp6, %x.0 - br label %BLoopAccumX - -BLoopAccumX: ; preds = %XLoopInit, %C2Add - %x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ] - %arrayidx7 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv - %tmp11 = load i32, ptr %arrayidx7, align 4 - %add8 = add nsw i32 %tmp11, %x.1 - store i32 %add8, ptr %arrayidx7, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %XLoopInit, label %for.end - -for.end: ; preds = %BLoopAccumX - ret void -} diff --git a/polly/test/GPGPU/align-params-in-schedule.ll b/polly/test/GPGPU/align-params-in-schedule.ll deleted file mode 100644 --- a/polly/test/GPGPU/align-params-in-schedule.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc - -; CHECK: polly_launchKernel - -; Verify that this program compiles. At some point, this compilation crashed -; due to insufficient parameters being available. 
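-;
-; (Illustrative sketch, an assumption inferred from the test name rather
-; than stated in the original comment: isl only combines objects whose
-; parameter spaces match, so a schedule piece such as
-;   [n] -> { Stmt[i] -> [i] : 0 <= i < n }
-; can only be composed with a piece that refers to another parameter m
-; after both have been aligned to a common parameter space [n, m].)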
- -source_filename = "bugpoint-output-4d01492.bc" -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -%struct.barney = type { ptr, i64, i64, [2 x %struct.widget] } -%struct.widget = type { i64, i64, i64 } - -@global = external unnamed_addr global %struct.barney, align 32 - -; Function Attrs: nounwind uwtable -define void @wobble(ptr noalias %arg) #0 { -bb: - %tmp = load i32, ptr %arg, align 4 - br label %bb1 - -bb1: ; preds = %bb13, %bb - %tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ] - br label %bb3 - -bb3: ; preds = %bb3, %bb1 - %tmp4 = load ptr, ptr @global, align 32 - %tmp5 = sext i32 %tmp2 to i64 - %tmp6 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 3, i64 1, i32 0), align 8 - %tmp7 = mul i64 %tmp6, %tmp5 - %tmp8 = add i64 %tmp7, 0 - %tmp9 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 1), align 8 - %tmp10 = add i64 %tmp8, %tmp9 - %tmp11 = getelementptr i32, ptr %tmp4, i64 %tmp10 - store i32 undef, ptr %tmp11, align 4 - %tmp12 = icmp eq i32 0, 0 - br i1 %tmp12, label %bb13, label %bb3 - -bb13: ; preds = %bb3 - %tmp14 = icmp eq i32 %tmp2, %tmp - %tmp15 = add i32 %tmp2, 1 - br i1 %tmp14, label %bb16, label %bb1 - -bb16: ; preds = %bb13 - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll b/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll deleted file mode 100644 --- a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: opt %loadPolly -S -polly-codegen-ppcg \ -; RUN: -polly-use-llvm-names < %s -; ModuleID = 'test/GPGPU/zero-size-array.ll' - -; REQUIRES: pollyacc - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - - -; We used to divide the element size by 8 to arrive at the 'actual' size -; of an array element. This used to cause arrays that have an element size -; of less than 8 to collapse to size 0. This test makes sure that it does -; not happen anymore. 
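-;
-; (Illustrative arithmetic, not part of the original comment: an i1
-; element has a bit width of 1, so the old integer division computed
-; 1 / 8 == 0 bytes, whereas a ceiling division such as (1 + 7) / 8 == 1
-; keeps sub-byte elements at least one byte wide.)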
-
-; f(int *niters_ptr, int *arr[0]) {
-;   const int niters = *niters_ptr;
-;   for(int i = 0; i < niters; i++) {
-;     arr[0][i + 1] = 0
-;   }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @f(ptr noalias %niters.ptr, ptr noalias %arr) #0 {
-entry:
- %niters = load i32, ptr %niters.ptr, align 4
- br label %loop.body
-
-loop.body: ; preds = %loop.body, %entry
- %indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ]
- %indvar.sext = sext i32 %indvar to i64
- %arr.slot = getelementptr [0 x i32], ptr %arr, i64 0, i64 %indvar.sext
- store i32 0, ptr %arr.slot, align 4
- %tmp8 = icmp eq i32 %indvar, %niters
- %indvar.next = add i32 %indvar, 1
- br i1 %tmp8, label %loop.exit, label %loop.body
-
-loop.exit: ; preds = %loop.body
- %tmp10 = icmp sgt i32 undef, 0
- br label %auxiliary.loop
-
-auxiliary.loop: ; preds = %"101", %loop.exit
- %tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ]
- br i1 undef, label %auxiliary.loop, label %exit
-
-exit: ; preds = %auxiliary.loop
- ret void
-}
-
-attributes #0 = { nounwind uwtable }
diff --git a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll b/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll
deleted file mode 100644
--- a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-ignore-parameter-bounds \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-;
-; REQUIRES: pollyacc

-; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain
-; all the parameters present in the program.
-;
-; The construction of the `isl_multi_pw_aff` requires all the individual `pw_aff`
-; to have the same parameter dimensions. To achieve this, we used to realign
-; every `pw_aff` with `Scop::Context`. However, in conjunction with
-; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context`
-; does not contain all parameters.
-;
-; We check that Polly does the right thing in this case and sets up the parameter
-; dimensions correctly.


-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll'
-
-; C pseudocode
-; ------------
-; void f(int *arr, long niters, long stride) {
-;   for(int i = 0; i < niters; i++) {
-;     arr[i * stride] = 1;
-;   }
-; }
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: nounwind uwtable
-define void @f(ptr %arr, i64 %niters, i64 %stride) unnamed_addr #1 {
-entry:
-  br label %loop
-
-loop:        ; preds = %loop, %entry
-  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]
-  %idx = mul nuw nsw i64 %indvar, %stride
-  %slot = getelementptr i32, ptr %arr, i64 %idx
-  store i32 1, ptr %slot, align 4
-  %indvar.next = add nuw nsw i64 %indvar, 1
-  %check = icmp sgt i64 %indvar.next, %niters
-  br i1 %check, label %exit, label %loop
-
-exit:        ; preds = %loop
-  ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind uwtable }
diff --git a/polly/test/GPGPU/cuda-annotations.ll b/polly/test/GPGPU/cuda-annotations.ll
deleted file mode 100644
--- a/polly/test/GPGPU/cuda-annotations.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; REQUIRES: pollyacc
-
-; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) #0 {
-
-; KERNEL: !nvvm.annotations = !{!0}
-
-; KERNEL: !0 = !{ptr @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, i64 %n) {
-bb:
-  br label %bb1
-
-bb1:        ; preds = %bb6, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
-  %tmp = icmp slt i64 %i.0, %n
-  br i1 %tmp, label %bb2, label %bb8
-
-bb2:        ; preds = %bb1
-  %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp4 = load i64, ptr %tmp3, align 8
-  %tmp5 = add nsw i64 %tmp4, 100
-  store i64 %tmp5, ptr %tmp3, align 8
-  br label %bb6
-
-bb6:        ; preds = %bb2
-  %tmp7 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb8:        ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll
deleted file mode 100644
--- a/polly/test/GPGPU/cuda-managed-memory-simple.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc
-
-;
-; #include <cuda.h>
-;
-; static const int N = 45;
-;
-; void copy(int *R, int *A) {
-;   for (int i = 0; i < N; i++) {
-;     R[i] = A[i] * 10;
-;   }
-; }
-;
-; int main() {
-;   int *A, *R;
-;
-;   cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
-;   cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
-;
-;   for (int i = 0; i < N; i++) {
-;     A[i] = i;
-;     R[i] = 0;
-;   }
-;   copy(R, A);
-;
-;   return 0;
-; }
-;
-
-; CHECK-NOT: polly_copyFromHostToDevice
-; CHECK-NOT: polly_copyFromDeviceToHost
-; CHECK-NOT: polly_freeDeviceMemory
-; CHECK-NOT: polly_allocateMemoryForDevice
-
-; CHECK: %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA()
-; CHECK-NEXT: %[[REGCA:[0-9]+]] = bitcast i32* %A to i8*
-; CHECK-NEXT: %[[REGCR:[0-9]+]] = bitcast i32* %R to i8*
-; CHECK-NEXT:
%[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 -; CHECK-NEXT: store i8* %[[REGCA]], i8** %polly_launch_0_param_0 -; CHECK-NEXT: %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8* -; CHECK-NEXT: store i8* %[[REGCP0]], i8** %[[REGGEP0]] -; CHECK-NEXT: %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 -; CHECK-NEXT: store i8* %[[REGCR]], i8** %polly_launch_0_param_1 -; CHECK-NEXT: %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8* -; CHECK-NEXT: store i8* %[[REGCP1]], i8** %[[REGGEP1]] -; CHECK-NEXT: %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([863 x i8], [863 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0)) -; CHECK-NEXT: call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) -; CHECK-NEXT: call void @polly_freeKernel(i8* %[[REGKERNEL]]) -; CHECK-NEXT: call void @polly_synchronizeDevice() -; CHECK-NEXT: call void @polly_freeContext(i8* %[[REGCTX]]) - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @copy(i32* %R, i32* %A) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] - %exitcond = icmp ne i64 %indvars.iv, 45 - br i1 %exitcond, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - %tmp = load i32, i32* %arrayidx, align 4 - %mul = mul nsw i32 %tmp, 10 - %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv - store i32 %mul, i32* %arrayidx2, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - br label %for.cond - -for.end: ; preds = %for.cond - ret void -} - -define i32 @main() { -entry: - %A = alloca i32*, align 8 - %R = alloca i32*, align 8 - %tmp = bitcast i32** %A to i8** - %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2 - %tmp1 = bitcast i32** %R to i8** - %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2 - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] - %exitcond = icmp ne i64 %indvars.iv, 45 - br i1 %exitcond, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %tmp2 = load i32*, i32** %A, align 8 - %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv - %tmp3 = trunc i64 %indvars.iv to i32 - store i32 %tmp3, i32* %arrayidx, align 4 - %tmp4 = load i32*, i32** %R, align 8 - %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv - store i32 0, i32* %arrayidx3, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - br label %for.cond - -for.end: ; preds = %for.cond - %tmp5 = load i32*, i32** %R, align 8 - %tmp6 = load i32*, i32** %A, align 8 - call void @copy(i32* %tmp5, i32* %tmp6) - ret i32 0 -} - -declare i32 @cudaMallocManaged(i8**, i64, i32) #1 diff --git a/polly/test/GPGPU/debug-metadata-leak.ll b/polly/test/GPGPU/debug-metadata-leak.ll deleted file mode 100644 --- a/polly/test/GPGPU/debug-metadata-leak.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: opt %loadPolly %s -polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: | FileCheck 
--check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; KERNEL-IR: define ptx_kernel void @FUNC_vec_add_1_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arr, i32 %N) #0 { - -; The instruction marked <<>> is copied into the GPUModule, -; with changes only to the parameters to access data on the device instead of -; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the -; instruction is annotated with a DILocation, copying the instruction also copies -; the metadata into the GPUModule. This stops codegenerating the ptx_kernel by -; failing the verification of the Module in GPUNodeBuilder::finalize, due to the -; copied DICompileUnit not being listed in a llvm.dbg.cu which was neither copied -; nor created. -; -; https://reviews.llvm.org/D35630 removes this debug metadata before the -; instruction is copied to the GPUModule. -; -; vec_add_1.c: -; void vec_add_1(int N, int arr[N]) { -; int i=0; -; for( i=0 ; i>> - store i32 %add, ptr %arrayidx, align 4, !dbg !26, !tbaa !27 - br label %for.inc, !dbg !25 - -for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31 - call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19 - br label %for.cond, !dbg !32, !llvm.loop !33 - -for.end: ; preds = %for.cond - ret void, !dbg !35 -} - -declare void @llvm.dbg.declare(metadata, metadata, metadata) - -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) - - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 5.0.0"} -!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) -!8 = !DISubroutineType(types: !9) -!9 = !{null, !10, !11} -!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64) -!12 = !{!13, !14, !15} -!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10) -!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11) -!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10) -!16 = !DIExpression() -!17 = !DILocation(line: 1, column: 20, scope: !7) -!18 = !DILocation(line: 1, column: 27, scope: !7) -!19 = !DILocation(line: 2, column: 7, scope: !7) -!20 = !DILocation(line: 3, column: 8, scope: !21) -!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) -!22 = !DILocation(line: 3, column: 15, scope: !23) -!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3) -!24 = !DILocation(line: 3, column: 3, scope: !21) -!25 = !DILocation(line: 3, column: 25, scope: !23) -!26 = !DILocation(line: 3, column: 32, scope: !23) -!27 = !{!28, !28, i64 0} -!28 = !{!"int", !29, i64 0} -!29 = !{!"omnipotent char", !30, i64 0} -!30 = !{!"Simple C/C++ TBAA"} -!31 = !DILocation(line: 3, column: 21, scope: !23) -!32 = !DILocation(line: 3, column: 3, scope: !23) -!33 = distinct !{!33, !24, !34} -!34 = !DILocation(line: 3, column: 35, scope: !21) -!35 = !DILocation(line: 4, column: 1, scope: !7) diff 
--git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll deleted file mode 100644 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ /dev/null @@ -1,254 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=SCHED %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-ASM - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today due to extensive output differences from when the test was written. - -; CHECK: Stmt_bb5 -; CHECK-NEXT: Domain := -; CHECK-NEXT: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }; -; CHECK-NEXT: Schedule := -; CHECK-NEXT: { Stmt_bb5[i0, i1] -> [i0, i1] }; -; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] }; -; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] }; - -; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }" -; SCHED-NEXT: child: -; SCHED-NEXT: context: "{ [] }" -; SCHED-NEXT: child: -; SCHED-NEXT: extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: sequence: -; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: set: -; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: guard: "{ [] }" -; SCHED-NEXT: - filter: "{ Stmt_bb5[i0, i1] }" -; SCHED-NEXT: child: -; SCHED-NEXT: guard: "{ [] }" -; SCHED-NEXT: child: -; SCHED-NEXT: mark: "kernel" -; SCHED-NEXT: child: -; SCHED-NEXT: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }" -; SCHED-NEXT: child: -; SCHED-NEXT: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }" -; SCHED-NEXT: child: -; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]" -; SCHED-NEXT: permutable: 1 -; SCHED-NEXT: coincident: [ 1, 1 ] -; SCHED-NEXT: child: -; SCHED-NEXT: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }" -; SCHED-NEXT: child: -; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]" -; SCHED-NEXT: permutable: 1 -; SCHED-NEXT: coincident: [ 1, 1 ] -; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: set: -; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: guard: "{ [] }" - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice)); -; 
CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(16, 32); -; CODE-NEXT: dim3 k0_dimGrid(32, 32); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) -; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3); - -; IR: polly.split_new_and_old: -; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024) -; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1 -; IR-NEXT: %polly.overflow.state = or i1 false, %.obit -; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0 -; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024) -; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1 -; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1 -; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0 -; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3) -; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1 -; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4 -; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0 -; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6) -; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1 -; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7 -; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0 -; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440 -; IR-NEXT: %5 = and i1 true, %4 -; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true -; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown -; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2 - -; IR: polly.start: -; IR-NEXT: br label %polly.acc.initialize - -; IR: polly.acc.initialize: -; IR-NEXT: [[GPUContext:%.*]] = call ptr @polly_initContext() -; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 4194304) -; IR-NEXT: call void @polly_copyFromHostToDevice(ptr %A, ptr %p_dev_array_MemRef_A, i64 4194304) -; IR-NEXT: [[DevPtr:%.*]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A) -; IR-NEXT: store ptr [[DevPtr]], ptr %polly_launch_0_param_0 -; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params -; IR-NEXT: call ptr @polly_getKernel -; IR-NEXT: call void @polly_launchKernel(ptr %11, i32 32, i32 32, i32 32, i32 16, i32 1, ptr %polly_launch_0_params_i8ptr) -; IR-NEXT: call void @polly_freeKernel -; IR-NEXT: call void @polly_copyFromDeviceToHost(ptr %p_dev_array_MemRef_A, ptr %A, i64 4194304) -; IR-NEXT: call void @polly_freeDeviceMemory(ptr %p_dev_array_MemRef_A) -; IR-NEXT: call void @polly_freeContext(ptr [[GPUContext]]) -; IR-NEXT: br label %polly.exiting - -; IR: polly.exiting: -; IR-NEXT: br label %polly.merge_new_and_old - -; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(ptr %MemRef_A) #0 { -; KERNEL-IR-NEXT: entry: -; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() -; KERNEL-IR-NEXT: %b1 = zext i32 %1 to i64 -; KERNEL-IR-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-IR-NEXT: %t0 = zext i32 %2 to i64 -; KERNEL-IR-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() -; KERNEL-IR-NEXT: %t1 = zext i32 %3 to i64 -; KERNEL-IR-NEXT: br label %polly.loop_preheader - -; KERNEL-IR-LABEL: polly.loop_exit: ; preds = 
%polly.stmt.bb5 -; KERNEL-IR-NEXT: ret void - -; KERNEL-IR-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader -; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ] -; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0 -; KERNEL-IR-NEXT: %6 = mul nsw i64 32, %b1 -; KERNEL-IR-NEXT: %7 = add nsw i64 %6, %t1 -; KERNEL-IR-NEXT: %8 = mul nsw i64 16, %polly.indvar -; KERNEL-IR-NEXT: %9 = add nsw i64 %7, %8 -; KERNEL-IR-NEXT: br label %polly.stmt.bb5 - -; KERNEL-IR-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header -; KERNEL-IR-NEXT: %10 = mul i64 %5, %9 -; KERNEL-IR-NEXT: %p_tmp6 = sitofp i64 %10 to float -; KERNEL-IR-NEXT: %11 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %12 = add nsw i64 %11, %t0 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024 -; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b1 -; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t1 -; KERNEL-IR-NEXT: %15 = mul nsw i64 16, %polly.indvar -; KERNEL-IR-NEXT: %16 = add nsw i64 %14, %15 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16 -; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A -; KERNEL-IR-NEXT: %tmp8_p_scalar_ = load float, ptr %polly.access.MemRef_A, align 4 -; KERNEL-IR-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6 -; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024 -; KERNEL-IR-NEXT: %19 = mul nsw i64 32, %b1 -; KERNEL-IR-NEXT: %20 = add nsw i64 %19, %t1 -; KERNEL-IR-NEXT: %21 = mul nsw i64 16, %polly.indvar -; KERNEL-IR-NEXT: %22 = add nsw i64 %20, %21 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22 -; KERNEL-IR-NEXT: %polly.access.MemRef_A4 = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A3 -; KERNEL-IR-NEXT: store float %p_tmp9, ptr %polly.access.MemRef_A4, align 4 -; KERNEL-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 -; KERNEL-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 0 -; KERNEL-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit - -; KERNEL-IR-LABEL: polly.loop_preheader: ; preds = %entry -; KERNEL-IR-NEXT: br label %polly.loop_header - -; KERNEL-IR: attributes #0 = { "polly.skip.fn" } - -; KERNEL-ASM: .version 3.2 -; KERNEL-ASM-NEXT: .target sm_30 -; KERNEL-ASM-NEXT: .address_size 64 - -; KERNEL-ASM: // .globl kernel_0 - -; KERNEL-ASM: .visible .entry kernel_0( -; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0 -; KERNEL-ASM-NEXT: ) - -; void double_parallel_loop(float A[][1024]) { -; for (long i = 0; i < 1024; i++) -; for (long j = 0; j < 1024; j++) -; A[i][j] += i * j; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @double_parallel_loop(ptr %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb13, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] - %exitcond1 = icmp ne i64 %i.0, 1024 - br i1 %exitcond1, label %bb3, label %bb15 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb10, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %j.0, 1024 - br i1 %exitcond, label %bb5, label %bb12 - -bb5: ; preds = %bb4 - %tmp = mul nuw nsw i64 %i.0, %j.0 - %tmp6 = sitofp i64 %tmp to float - %tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0 - %tmp8 = load float, ptr %tmp7, align 4 - 
%tmp9 = fadd float %tmp8, %tmp6 - store float %tmp9, ptr %tmp7, align 4 - br label %bb10 - -bb10: ; preds = %bb5 - %tmp11 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12 - %tmp14 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb15: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/failing-invariant-load-handling.ll b/polly/test/GPGPU/failing-invariant-load-handling.ll deleted file mode 100644 --- a/polly/test/GPGPU/failing-invariant-load-handling.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS -; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" - -%S = type { i32, i32, [12 x %L] } -%L = type { i32, i32, double, i32, i32, i32, i32, i32 } - -define void @test(ptr %cpi, i1 %b) { -; SCOPS-LABEL: Region: %if.then14---%exit -; SCOPS: Invariant Accesses: { -; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] }; -; SCOPS-NEXT: Execution Context: [l2, l1] -> { : } -; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] }; -; SCOPS-NEXT: Execution Context: [l2, l1] -> { : l2 > 0 } -; SCOPS-NEXT: } -; SCOPS: Arrays { -; SCOPS-NEXT: i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4 -; SCOPS-NEXT: } - -; Check that we gracefully handle failing invariant loads. -; This test case is taken from: -; test/Isl/CodeGen/invariant-load-dimension.ll - -; FIXME: Figure out how to actually generate code for this loop. 
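Several of the tests in this area exercise invariant load hoisting. A rough C-level sketch of what the transformation does (illustrative only; Polly performs it on LLVM IR and must additionally prove the load safe to execute early):

    #include <assert.h>
    #include <stddef.h>

    /* Before hoisting: *n is re-loaded on every iteration, although nothing
     * in the loop can change it. */
    void before(int *restrict out, const int *restrict n, size_t len) {
      for (size_t i = 0; i < len; i++)
        out[i] = *n;
    }

    /* After hoisting: the invariant load happens once, ahead of the loop.
     * Polly keeps such preloaded values in stack slots named *.preload.s2a,
     * which is what the CHECK/HOST-IR lines in these tests look for. */
    void after(int *restrict out, const int *restrict n, size_t len) {
      int preloaded = *n;
      for (size_t i = 0; i < len; i++)
        out[i] = preloaded;
    }

    int main(void) {
      int n = 7, a[4], b[4];
      before(a, &n, 4);
      after(b, &n, 4);
      for (size_t i = 0; i < 4; i++)
        assert(a[i] == b[i]); /* both variants compute the same result */
      return 0;
    }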
-; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function
-
-entry:
-  %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
-  br i1 %b, label %if.then14, label %exit
-
-if.then14:
-  %l0 = load i32, ptr %cpi, align 8
-  %cmp12.i = icmp sgt i32 %l0, 0
-  br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
-
-for.body.lr.ph.i:
-  %l1 = load i32, ptr %nt, align 4
-  br label %for.body.i
-
-for.body.i:
-  %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
-  %mul.i163 = mul nsw i32 %phi, %l1
-  %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
-  store i32 0, ptr %cv, align 8
-  %inc = add nuw nsw i32 %phi, 1
-  %l2 = load i32, ptr %cpi, align 8
-  %cmp.i164 = icmp slt i32 %inc, %l2
-  br i1 %cmp.i164, label %for.body.i, label %exit
-
-exit:
-  ret void
-}
diff --git a/polly/test/GPGPU/failing-invariant-load-hoisting.ll b/polly/test/GPGPU/failing-invariant-load-hoisting.ll
deleted file mode 100644
--- a/polly/test/GPGPU/failing-invariant-load-hoisting.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
-
-%S = type { i32, i32, [12 x %L] }
-%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
-
-define void @test(ptr %cpi, i1 %b) {
-; CODEGEN-LABEL: @test(
-; CODEGEN: polly.preload.begin:
-; CODEGEN-NEXT: br i1 false
-
-entry:
-  %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
-  br i1 %b, label %if.then14, label %exit
-
-if.then14:
-  %l0 = load i32, ptr %cpi, align 8
-  %cmp12.i = icmp sgt i32 %l0, 0
-  br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
-
-for.body.lr.ph.i:
-  %l1 = load i32, ptr %nt, align 4
-  br label %for.body.i
-
-for.body.i:
-  %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
-  %mul.i163 = mul nsw i32 %phi, %l1
-  %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
-  store i32 0, ptr %cv, align 8
-  %inc = add nuw nsw i32 %phi, 1
-  %l2 = load i32, ptr %cpi, align 8
-  %cmp.i164 = icmp slt i32 %inc, %l2
-  br i1 %cmp.i164, label %for.body.i, label %exit
-
-exit:
-  ret void
-}
diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll
deleted file mode 100644
--- a/polly/test/GPGPU/host-control-flow.ll
+++ /dev/null
@@ -1,176 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
-; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
-; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
-; RUN: -S < %s | FileCheck %s -check-prefix=IR
-; void foo(float A[2][100]) {
-;   for (long t = 0; t < 100; t++)
-;     for (long i = 1; i < 99; i++)
-;       A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1];
-; }
-
-; REQUIRES: pollyacc
-
-; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(4);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, c0);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT:
cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: } - -; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader -; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] -; ... -; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 -; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* -; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] -; IR: call i8* @polly_getKernel -; ... -; IR: call void @polly_freeKernel -; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 -; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99 -; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0) -; KERNEL-IR-LABEL: entry: -; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-IR-NEXT: %t0 = zext i32 %1 to i64 -; KERNEL-IR-NEXT: br label %polly.cond - -; KERNEL-IR-LABEL: polly.cond: ; preds = %entry -; KERNEL-IR-NEXT: %2 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %3 = add nsw i64 %2, %t0 -; KERNEL-IR-NEXT: %4 = icmp sle i64 %3, 97 -; KERNEL-IR-NEXT: br i1 %4, label %polly.then, label %polly.else - -; KERNEL-IR-LABEL: polly.merge: ; preds = %polly.else, %polly.stmt.for.body3 -; KERNEL-IR-NEXT: ret void - -; KERNEL-IR-LABEL: polly.then: ; preds = %polly.cond -; KERNEL-IR-NEXT: %5 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %6 = add nsw i64 %5, %t0 -; KERNEL-IR-NEXT: br label %polly.stmt.for.body3 - -; KERNEL-IR-LABEL: polly.stmt.for.body3: ; preds = %polly.then -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %pexp.pdiv_r = urem i64 %c0, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100 -; KERNEL-IR-NEXT: %7 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %8 = add nsw i64 %7, %t0 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8 -; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A -; KERNEL-IR-NEXT: %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4 -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %pexp.pdiv_r2 = urem i64 %c0, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100 -; KERNEL-IR-NEXT: %9 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %10 = add nsw i64 %9, %t0 -; KERNEL-IR-NEXT: %11 = add nsw i64 %10, 1 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11 -; KERNEL-IR-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4 -; KERNEL-IR-NEXT: %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4 -; KERNEL-IR-NEXT: %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_ -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %pexp.pdiv_r7 = urem i64 %c0, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100 -; KERNEL-IR-NEXT: %12 = mul nsw 
i64 32, %b0 -; KERNEL-IR-NEXT: %13 = add nsw i64 %12, %t0 -; KERNEL-IR-NEXT: %14 = add nsw i64 %13, 2 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14 -; KERNEL-IR-NEXT: %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9 -; KERNEL-IR-NEXT: %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4 -; KERNEL-IR-NEXT: %p_add12 = fadd float %p_add, %tmp3_p_scalar_ -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %15 = add nsw i64 %c0, 1 -; KERNEL-IR-NEXT: %pexp.pdiv_r12 = urem i64 %15, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100 -; KERNEL-IR-NEXT: %16 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %17 = add nsw i64 %16, %t0 -; KERNEL-IR-NEXT: %18 = add nsw i64 %17, 1 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18 -; KERNEL-IR-NEXT: %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14 -; KERNEL-IR-NEXT: %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4 -; KERNEL-IR-NEXT: %p_add17 = fadd float %tmp4_p_scalar_, %p_add12 -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %19 = add nsw i64 %c0, 1 -; KERNEL-IR-NEXT: %pexp.pdiv_r17 = urem i64 %19, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100 -; KERNEL-IR-NEXT: %20 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %21 = add nsw i64 %20, %t0 -; KERNEL-IR-NEXT: %22 = add nsw i64 %21, 1 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22 -; KERNEL-IR-NEXT: %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19 -; KERNEL-IR-NEXT: store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4 -; KERNEL-IR-NEXT: br label %polly.merge - -; KERNEL-IR-LABEL: polly.else: ; preds = %polly.cond -; KERNEL-IR-NEXT: br label %polly.merge -; KERNEL-IR-NEXT: } - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo([100 x float]* %A) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc18, %entry - %t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ] - %exitcond1 = icmp ne i64 %t.0, 100 - br i1 %exitcond1, label %for.body, label %for.end20 - -for.body: ; preds = %for.cond - br label %for.cond1 - -for.cond1: ; preds = %for.inc, %for.body - %i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ] - %exitcond = icmp ne i64 %i.0, 99 - br i1 %exitcond, label %for.body3, label %for.end - -for.body3: ; preds = %for.cond1 - %sub = add nsw i64 %i.0, -1 - %rem = srem i64 %t.0, 2 - %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub - %tmp = load float, float* %arrayidx4, align 4 - %rem5 = srem i64 %t.0, 2 - %arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0 - %tmp2 = load float, float* %arrayidx7, align 4 - %add = fadd float %tmp, %tmp2 - %add8 = add nuw nsw i64 %i.0, 1 - %rem9 = srem i64 %t.0, 2 - %arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8 - %tmp3 = load float, float* %arrayidx11, align 4 - %add12 = fadd float %add, %tmp3 - %add13 = add nuw nsw i64 %t.0, 1 - %rem14 = 
srem i64 %add13, 2
-  %arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0
-  %tmp4 = load float, float* %arrayidx16, align 4
-  %add17 = fadd float %tmp4, %add12
-  store float %add17, float* %arrayidx16, align 4
-  br label %for.inc
-
-for.inc:        ; preds = %for.body3
-  %inc = add nuw nsw i64 %i.0, 1
-  br label %for.cond1
-
-for.end:        ; preds = %for.cond1
-  br label %for.inc18
-
-for.inc18:        ; preds = %for.end
-  %inc19 = add nuw nsw i64 %t.0, 1
-  br label %for.cond
-
-for.end20:        ; preds = %for.cond
-  ret void
-}
diff --git a/polly/test/GPGPU/host-statement.ll b/polly/test/GPGPU/host-statement.ll
deleted file mode 100644
--- a/polly/test/GPGPU/host-statement.ll
+++ /dev/null
@@ -1,204 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -polly-invariant-load-hoisting=false \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -polly-invariant-load-hoisting=false \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=KERNEL-IR %s
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-declare void @llvm.lifetime.start(i64, ptr nocapture) #0
-
-; This test case checks that we can correctly handle a ScopStmt that is
-; scheduled on the host, instead of within a kernel.
-
-; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT:   dim3 k0_dimBlock(32);
-; CODE-NEXT:   dim3 k0_dimGrid(16);
-; CODE-NEXT:   kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
-; CODE-NEXT:   cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: if (p_0 <= 510 && p_1 <= 510) {
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k1_dimBlock(32);
-; CODE-NEXT:     dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
-; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE: {
-; CODE-NEXT:   dim3 k2_dimBlock(16, 32);
-; CODE-NEXT:   dim3 k2_dimGrid(16, p_1 <= -7650 ? 256 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
-; CODE-NEXT:   kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
-; CODE-NEXT:   cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: }
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT: Stmt_for_cond33_preheader_last();
-
-; CODE: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_for_body16(32 * b0 + t0);
-
-; CODE: # kernel1
-; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1)
-; CODE-NEXT:   for (int c1 = 0; c1 <= 15; c1 += 1) {
-; CODE-NEXT:     if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0)
-; CODE-NEXT:       Stmt_for_body35(32 * b0 + t0 + 1048576 * c0);
-; CODE-NEXT:     if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510)
-; CODE-NEXT:       for (int c3 = 0; c3 <= 31; c3 += 1)
-; CODE-NEXT:         Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3);
-; CODE-NEXT:     sync0();
-; CODE-NEXT:   }
-
-; CODE: # kernel2
-; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1)
-; CODE-NEXT:   if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510)
-; CODE-NEXT:     for (int c3 = 0; c3 <= 1; c3 += 1)
-; CODE-NEXT:       Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3);
-
-; KERNEL-IR: call void @llvm.nvvm.barrier0()
-
-; Function Attrs: nounwind uwtable
-define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, ptr %A, ptr %R, ptr %Q) #1 {
-entry:
-  br label %entry.split
-
-entry.split:        ; preds = %entry
-  br label %for.cond1.preheader
-
-for.cond1.preheader:        ; preds = %entry.split, %for.inc86
-  %indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ]
-  %indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ]
-  br label %for.inc
-
-for.inc:        ; preds = %for.cond1.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
-  %nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ]
-  %arrayidx5 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
-  %tmp = load double, ptr %arrayidx5, align 8, !tbaa !1
-  %arrayidx9 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
-  %tmp27 = load double, ptr %arrayidx9, align 8, !tbaa !1
-  %mul = fmul double %tmp, %tmp27
-  %add = fadd double %nrm.02, %mul
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 512
-  br i1 %exitcond, label %for.inc, label %for.end
-
-for.end:        ; preds = %for.inc
-  %add.lcssa = phi double [ %add, %for.inc ]
-  %call = tail call double @sqrt(double %add.lcssa) #2
-  %arrayidx13 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24
-  store double %call, ptr %arrayidx13, align 8, !tbaa !1
-  br label %for.body16
-
-for.cond33.preheader:        ; preds = %for.body16
-  %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
-  %cmp347 = icmp slt i64 %indvars.iv.next25, 512
-  br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86
-
-for.body35.lr.ph:        ; preds = %for.cond33.preheader
-  br label %for.body35
-
-for.body16:        ; preds = %for.end, %for.body16
-  %indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ]
-  %arrayidx20 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv10, i64
%indvars.iv24 - %tmp28 = load double, ptr %arrayidx20, align 8, !tbaa !1 - %arrayidx24 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24 - %tmp29 = load double, ptr %arrayidx24, align 8, !tbaa !1 - %div = fdiv double %tmp28, %tmp29 - %arrayidx28 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv10, i64 %indvars.iv24 - store double %div, ptr %arrayidx28, align 8, !tbaa !1 - %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1 - %exitcond12 = icmp ne i64 %indvars.iv.next11, 512 - br i1 %exitcond12, label %for.body16, label %for.cond33.preheader - -for.cond33.loopexit: ; preds = %for.body62 - %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next22 to i32 - %exitcond23 = icmp ne i32 %lftr.wideiv, 512 - br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge - -for.body35: ; preds = %for.body35.lr.ph, %for.cond33.loopexit - %indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ] - %arrayidx39 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 - store double 0.000000e+00, ptr %arrayidx39, align 8, !tbaa !1 - br label %for.body42 - -for.cond60.preheader: ; preds = %for.body42 - br label %for.body62 - -for.body42: ; preds = %for.body35, %for.body42 - %indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ] - %arrayidx46 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv13, i64 %indvars.iv24 - %tmp30 = load double, ptr %arrayidx46, align 8, !tbaa !1 - %arrayidx50 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv13, i64 %indvars.iv21 - %tmp31 = load double, ptr %arrayidx50, align 8, !tbaa !1 - %mul51 = fmul double %tmp30, %tmp31 - %arrayidx55 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 - %tmp32 = load double, ptr %arrayidx55, align 8, !tbaa !1 - %add56 = fadd double %tmp32, %mul51 - store double %add56, ptr %arrayidx55, align 8, !tbaa !1 - %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 - %exitcond15 = icmp ne i64 %indvars.iv.next14, 512 - br i1 %exitcond15, label %for.body42, label %for.cond60.preheader - -for.body62: ; preds = %for.cond60.preheader, %for.body62 - %indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ] - %arrayidx66 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21 - %tmp33 = load double, ptr %arrayidx66, align 8, !tbaa !1 - %arrayidx70 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv16, i64 %indvars.iv24 - %tmp34 = load double, ptr %arrayidx70, align 8, !tbaa !1 - %arrayidx74 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 - %tmp35 = load double, ptr %arrayidx74, align 8, !tbaa !1 - %mul75 = fmul double %tmp34, %tmp35 - %sub = fsub double %tmp33, %mul75 - %arrayidx79 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21 - store double %sub, ptr %arrayidx79, align 8, !tbaa !1 - %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 - %exitcond18 = icmp ne i64 %indvars.iv.next17, 512 - br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit - -for.cond33.for.inc86_crit_edge: ; preds = %for.cond33.loopexit - br label %for.inc86 - -for.inc86: ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond26 = icmp ne i64 %indvars.iv.next25, 512 - br i1 
%exitcond26, label %for.cond1.preheader, label %for.end88 - -for.end88: ; preds = %for.inc86 - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, ptr nocapture) #0 - -; Function Attrs: nounwind -declare double @sqrt(double) #2 - -attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } - -!llvm.ident = !{!0} - -!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"} -!1 = !{!2, !2, i64 0} -!2 = !{!"double", !3, i64 0} -!3 = !{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/ignore-parameter-bounds.ll b/polly/test/GPGPU/ignore-parameter-bounds.ll deleted file mode 100644 --- a/polly/test/GPGPU/ignore-parameter-bounds.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; CODE: Code -; CODE: ==== -; CODE: No code generated - -source_filename = "bugpoint-output-83bcdeb.bc" -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -@__data_radiation_MOD_cobi = external global [168 x double], align 32 - -; Function Attrs: nounwind uwtable -define void @__radiation_rg_MOD_coe_so() #0 { -entry: - %polly.access.kspec.load = load i32, ptr undef, align 4 - %0 = or i1 undef, undef - br label %polly.preload.cond29 - -polly.preload.cond29: ; preds = %entry - br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30 - -polly.preload.merge30: ; preds = %polly.preload.exec31, %polly.preload.cond29 - %polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ] - ret void - -polly.preload.exec31: ; preds = %polly.preload.cond29 - %1 = sext i32 %polly.access.kspec.load to i64 - %2 = mul nsw i64 7, %1 - %3 = add nsw i64 0, %2 - %4 = add nsw i64 %3, 48 - %polly.access.__data_radiation_MOD_cobi = getelementptr double, ptr @__data_radiation_MOD_cobi, i64 %4 - %polly.access.__data_radiation_MOD_cobi.load = load double, ptr %polly.access.__data_radiation_MOD_cobi, align 8 - br label %polly.preload.merge30 -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll b/polly/test/GPGPU/intrinsic-copied-into-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR -; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR - -; Test that we do recognise and codegen a kernel that has intrinsics. - -; REQUIRES: pollyacc - -; Check that we model the kernel as a scop. 
-; SCOP: Function: f
-; SCOP-NEXT: Region: %entry.split---%for.end
-
-; Check that the intrinsic call is present in the kernel IR.
-; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
-; KERNEL-IR: declare float @llvm.sqrt.f32(float)
-; KERNEL-IR: declare float @llvm.fabs.f32(float)
-
-
-; Check that the kernel launch is generated in the host IR.
-; The declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
-
-
-; void f(float *A, float *B, int N) {
-;   for(int i = 0; i < N; i++) {
-;     float tmp0 = A[i];
-;     float tmp1 = sqrt(tmp0);
-;     float tmp2 = fabs(tmp1);
-;     float tmp3 = copysignf(tmp1, tmp2);
-;     B[i] = tmp3;
-;   }
-; }
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @f(float* %A, float* %B, i32 %N) {
-entry:
-  br label %entry.split
-
-entry.split:        ; preds = %entry
-  %cmp1 = icmp sgt i32 %N, 0
-  br i1 %cmp1, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:        ; preds = %entry.split
-  br label %for.body
-
-for.body:        ; preds = %for.body.lr.ph, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  %A.arr.i.val = load float, float* %A.arr.i, align 4
-  ; Call to intrinsics that should be part of the kernel.
-  %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
-  %fabs = tail call float @llvm.fabs.f32(float %sqrt);
-  %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
-  %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %copysign, float* %B.arr.i, align 4
-
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %wide.trip.count = zext i32 %N to i64
-  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:        ; preds = %for.body
-  br label %for.end
-
-for.end:        ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare float @llvm.sqrt.f32(float) #0
-declare float @llvm.fabs.f32(float) #0
-declare float @llvm.copysign.f32(float, float) #0
-
-attributes #0 = { nounwind readnone }
-
diff --git a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll b/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll
deleted file mode 100644
--- a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \
-; RUN: -disable-output < %s
-
-; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually
-; fail on an illegal module.
-
-; REQUIRES: pollyacc, asserts
-; XFAIL: *
-;
-; void foo(long A[1024], long B[1024]) {
-;   for (long i = 0; i < 1024; i++)
-;     A[i] += (B[i] + (long)&B[i]);
-; }
-
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-bb:
-  br label %bb1
-
-bb1:        ; preds = %bb10, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb12
-
-bb2:        ; preds = %bb1
-  %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp3 = load i64, ptr %tmp, align 8
-  %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp5 = ptrtoint ptr %tmp4 to i64
-  %tmp6 = add nsw i64 %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp8 = load i64, ptr %tmp7, align 8
-  %tmp9 = add nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, ptr %tmp7, align 8
-  br label %bb10
-
-bb10:        ; preds = %bb2
-  %tmp11 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb12:        ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/invalid-kernel.ll b/polly/test/GPGPU/invalid-kernel.ll
deleted file mode 100644
--- a/polly/test/GPGPU/invalid-kernel.ll
+++ /dev/null
@@ -1,73 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: not FileCheck %s -check-prefix=KERNEL-IR
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; REQUIRES: pollyacc
-;
-; void foo(long A[1024], long B[1024]) {
-;   for (long i = 0; i < 1024; i++)
-;     A[i] += (B[i] + (long)&B[i]);
-; }
-
-; This kernel loads/stores a pointer address we model. This is a rare case,
-; where we still lack proper code-generation support. We check here that we
-; detect the invalid IR and bail out gracefully.
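The underlying problem: each stored value bakes in a host address, so the same loop run against a device copy of B would see different values for &B[i] and silently compute different data. A standalone C version of the loop from the comment above (illustrative):

    #include <stdio.h>

    /* The value stored into A[i] depends on the *host* address of B[i].
     * Offloading the body to a GPU would evaluate &B[i] against a device
     * buffer, changing the results -- hence codegen must bail out. */
    void foo(long A[1024], long B[1024]) {
      for (long i = 0; i < 1024; i++)
        A[i] += B[i] + (long)&B[i];
    }

    int main(void) {
      static long A[1024], B[1024];
      foo(A, B);
      /* The output is only meaningful relative to where B happens to live. */
      printf("A[0] = %ld\n", A[0]);
      return 0;
    }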
-
-; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT:   dim3 k0_dimBlock(32);
-; CODE-NEXT:   dim3 k0_dimGrid(32);
-; CODE-NEXT:   kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B, dev_MemRef_A);
-; CODE-NEXT:   cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; KERNEL-IR: kernel
-
-; IR: br i1 false, label %polly.start, label %bb1
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-bb:
-  br label %bb1
-
-bb1:        ; preds = %bb10, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb12
-
-bb2:        ; preds = %bb1
-  %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp3 = load i64, ptr %tmp, align 8
-  %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp5 = ptrtoint ptr %tmp4 to i64
-  %tmp6 = add nsw i64 %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp8 = load i64, ptr %tmp7, align 8
-  %tmp9 = add nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, ptr %tmp7, align 8
-  br label %bb10
-
-bb10:        ; preds = %bb2
-  %tmp11 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb12:        ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/invariant-load-array-access.ll b/polly/test/GPGPU/invariant-load-array-access.ll
deleted file mode 100644
--- a/polly/test/GPGPU/invariant-load-array-access.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-
-
-; REQUIRES: pollyacc
-
-; Check that we detect a scop.
-; SCOP: Function: f
-; SCOP-NEXT: Region: %for.body---%for.end
-; SCOP-NEXT: Max Loop Depth: 1
-; SCOP-NEXT: Invariant Accesses: {
-; SCOP-NEXT:   ReadAccess := [Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:     [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] };
-; SCOP-NEXT:   Execution Context: [tmp] -> { : }
-; SCOP-NEXT:   ReadAccess := [Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:     [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] };
-; SCOP-NEXT:   Execution Context: [tmp] -> { : tmp >= 4 }
-; SCOP-NEXT: }
-
-; Check that the kernel launch is generated in the host IR.
-; The declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-; This test makes sure that such an access pattern is handled correctly
-; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads`
-; was the main reason that caused this test case to crash.
-;
-; void f(int *arr, const int *control, const int *readarr) {
-;   for(int i = 0; i < 1000; i++) {
-;     int t = 0;
-;     if (*control > 3) {
-;       t += *readarr;
-;     }
-;     arr[i] = t;
-;   }
-; }
-
-
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.12.0"
-define void @f(ptr %arr, ptr %control, ptr %readarr) {
-entry:
-  br label %entry.split
-
-entry.split:        ; preds = %entry
-  br label %for.body
-
-for.body:        ; preds = %entry.split, %if.end
-  %i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ]
-  %tmp = load i32, ptr %control, align 4
-  %cmp1 = icmp sgt i32 %tmp, 3
-  br i1 %cmp1, label %if.then, label %if.end
-
-if.then:        ; preds = %for.body
-  %tmp1 = load i32, ptr %readarr, align 4
-  br label %if.end
-
-if.end:        ; preds = %if.then, %for.body
-  %t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.01
-  store i32 %t.0, ptr %arrayidx, align 4
-  %inc = add nuw nsw i32 %i.01, 1
-  %exitcond = icmp eq i32 %inc, 1000
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:        ; preds = %if.end
-  ret void
-}
diff --git a/polly/test/GPGPU/invariant-load-escaping-values.ll b/polly/test/GPGPU/invariant-load-escaping-values.ll
deleted file mode 100644
--- a/polly/test/GPGPU/invariant-load-escaping-values.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s
-
-; REQUIRES: pollyacc
-
-; CHECK: store i64 %polly.access.B.load, ptr %invariant.preload.s2a
-; CHECK: %invariant.final_reload = load i64, ptr %invariant.preload.s2a
-
-; Verify that the final reload of an invariant scalar memory access uses the
-; same stack slot into which the invariant memory access was stored
-; originally. Earlier, this was broken: we introduced a new stack slot aside
-; from the preload stack slot, which remained uninitialized and caused our
-; escaping loads to contain garbage.
-
-define i64 @foo(ptr %A, ptr %B) {
-entry:
-  br label %loop
-
-loop:
-  %indvar = phi i64 [0, %entry], [%indvar.next, %loop]
-  %indvar.next = add nsw i64 %indvar, 1
-  %idx = getelementptr float, ptr %A, i64 %indvar
-  store float 42.0, ptr %idx
-  %invariant = load i64, ptr %B
-  %cmp = icmp sle i64 %indvar, 1024
-  br i1 %cmp, label %loop, label %exit
-
-exit:
-  ret i64 %invariant
-}
diff --git a/polly/test/GPGPU/invariant-load-hoisting-of-array.ll b/polly/test/GPGPU/invariant-load-hoisting-of-array.ll
deleted file mode 100644
--- a/polly/test/GPGPU/invariant-load-hoisting-of-array.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-
-; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; Entry: Contains (%loaded.ptr.preload.s2a = alloca double*) which is
-;        |        the invariant load hoisted `%loaded.ptr`
-;        v
-; Run-time check --(failure branch)--> { old code - contains `%loaded.ptr` }
-;        |
-; (success branch)
-;        |
-;        v
-; New Code: Should refer to `%loaded.ptr.preload.s2a`, which is
-; the invariant load hoisted value, NOT `%loaded.ptr`.
-
-; In Polly, we preserve the old code and create a separate branch that executes
-; the GPU code if a run-time check succeeds.
-
-; We need to make sure that in the new branch, we pick up invariant load hoisted
-; values. The old values will belong to the old code branch.
- -; In this case, we use to try to load the 'original' %loaded.ptr in the -; 'New Code' branch,which is wrong. Check that this does not happen. - -; Check that we have a Scop with an invariant load of the array. -; SCOP: Function: f -; SCOP-NEXT: Region: %arrload---%for.exit -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: { Stmt_arrload[] -> MemRef_arr_of_ptrs[0] }; - - - -; Check that we have the preloaded array. -; HOST-IR: entry: -; HOST-IR-NEXT: %loaded.ptr.preload.s2a = alloca double* - -; Chek that we store the correct value in the preload. -; polly.preload.begin: ; preds = %polly.split_new_and_old -; HOST-IR: %polly.access.arr.of.ptrs = getelementptr double*, double** %arr.of.ptrs, i64 0 -; HOST-IR-NEXT: %polly.access.arr.of.ptrs.load = load double*, double** %polly.access.arr.of.ptrs -; HOST-IR-NEXT: store double* %polly.access.arr.of.ptrs.load, double** %loaded.ptr.preload.s2a - -; Check that we get back data from the kernel. -; HOST-IR: polly.acc.initialize: ; preds = %polly.start -; HOST-IR: [[FIRSTINDEX:%.+]] = getelementptr double, double* %polly.access.arr.of.ptrs.load, i64 1 -; HOST-IR: [[BITCASTED:%.+]] = bitcast double* [[FIRSTINDEX]] to i8* -; HOST-IR: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_loaded_ptr, i8* [[BITCASTED]], i64 800) - -; Check that the kernel launch is generated in the host IR. -; This declaration would not have been generated unless a kernel launch exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) - - -; C pseudocode equivalent -; void f(double **arr_of_ptrs) { -; double *loaded_ptr = arr_of_ptrs[0]; -; if (false) { return; } -; else { -; for(int i = 1; i < 100; i++) { -; loaded_ptr[i] = 42.0; -; } -; } -; } - - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - - -; Function Attrs: nounwind uwtable -define void @f(double **%arr.of.ptrs) #0 { -entry: - br label %arrload - -arrload: ; preds = %"7" - %loaded.ptr = load double*, double** %arr.of.ptrs, align 8 - br i1 false, label %"for.exit", label %"for.preheader" - -"for.preheader": ; preds = %"51" - br label %"for.body" - -"for.body": ; preds = %"53", %"53.lr.ph" - %indvar = phi i64 [ 1, %"for.preheader" ], [ %indvar.next, %"for.body" ] - %slot = getelementptr double, double* %loaded.ptr, i64 %indvar - store double 42.0, double* %slot, align 8 - - %indvar.next = add nuw nsw i64 %indvar, 1 - - %check = icmp sgt i64 %indvar.next, 100 - br i1 %check, label %"for.exit", label %"for.body" - -"for.exit": ; preds = %"52.54_crit_edge", %"51" - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll b/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=HOST-IR %s - -; RUN: opt %loadPolly -disable-output -polly-acc-dump-kernel-ir \ -; RUN: -polly-codegen-ppcg -polly-scops \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; Verify that invariant loads used in a kernel statement are correctly 
forwarded -; as subtree value to the GPU kernel. - -; HOST-IR: store float %polly.access.p.load, ptr %invariant.preload.s2a, align 4 - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2({{.*}}ptr addrspace(1) %MemRef_indvar2f__phi{{.*}}) -; KERNEL-IR: %indvar2f.phiops.reload = load float, ptr %indvar2f.phiops, align 4 -; KERNEL-IR: store float %indvar2f.phiops.reload, ptr addrspace(1) %polly.access.MemRef_A, align 4 - -; FIXME: store float %indvar2f.phiops.reload, ptr %indvar2f.phiops, align 4 -; For some reason the above instruction is emitted that stores back to the addess it was just loaded from. - -define void @foo(ptr %A, ptr %p) { -entry: - br label %loop - -loop: - %indvar = phi i64 [0, %entry], [%indvar.next, %loop] - %indvar.next = add i64 %indvar, 1 - %invariant = load float, ptr %p - %ptr = getelementptr float, ptr %A, i64 %indvar - store float 42.0, ptr %ptr - %cmp = icmp sle i64 %indvar, 1024 - br i1 %cmp, label %loop, label %anotherloop - -anotherloop: - %indvar2 = phi i64 [0, %loop], [%indvar2.next, %anotherloop] - %indvar2f = phi float [%invariant, %loop], [%indvar2f, %anotherloop] - %indvar2.next = add i64 %indvar2, 1 - store float %indvar2f, ptr %A - %cmp2 = icmp sle i64 %indvar2, 1024 - br i1 %cmp2, label %anotherloop, label %end - -end: - ret void - -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_entry_split[] -> MemRef_begin[0] }; -; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_for_body[i0] -> MemRef_end[0] }; -; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } -; SCOP-NEXT: } - - -; Check that the kernel launch is generated in the host IR. -; This declaration would not have been generated unless a kernel launch exists. 
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; void f(int *begin, int *end, int *arr) { -; for (int i = *begin; i < *end; i++) { -; arr[i] = 0; -; } -; } -; - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" - -define void @f(ptr %begin, ptr %end, ptr %arr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp1 = load i32, ptr %begin, align 4 - %tmp41 = load i32, ptr %end, align 4 - %cmp2 = icmp slt i32 %tmp1, %tmp41 - br i1 %cmp2, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %i.03 = phi i32 [ %tmp1, %for.body.lr.ph ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.03 - store i32 0, ptr %arrayidx, align 4 - %inc = add nsw i32 %i.03, 1 - %tmp4 = load i32, ptr %end, align 4 - %cmp = icmp slt i32 %inc, %tmp4 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Check that we detect a scop with invariant accesses. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [beginval] -> { Stmt_entry_split[] -> MemRef_begin[0] }; -; SCOP-NEXT: Execution Context: [beginval] -> { : } -; SCOP-NEXT: } - -; Check that the kernel launch is generated in the host IR. -; This declaration would not have been generated unless a kernel launch exists. 
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; -; void f(int *begin, int *arr) { -; for (int i = *begin; i < 100; i++) { -; arr[i] = 0; -; } -; } - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" - -define void @f(ptr %begin, ptr %arr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %beginval = load i32, ptr %begin, align 4 - %cmp1 = icmp slt i32 %beginval, 100 - br i1 %cmp1, label %for.body, label %for.end - - - -for.body: ; preds = %for.body.lr.ph, %for.body - %ival = phi i32 [ %beginval, %entry.split ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %ival - store i32 0, ptr %arrayidx, align 4 - %inc = add nsw i32 %ival, 1 - %cmp = icmp slt i32 %ival, 99 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Check that we detect a scop with invariant accesses. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp2] -> { Stmt_for_body[i0] -> MemRef_idx[0] }; -; SCOP-NEXT: Execution Context: [tmp2] -> { : } -; SCOP-NEXT: } - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; Check if we generate GPU code for simple loop with variable upper bound. -; This always worked, but have this test to prevent regressions. 
-; void f(int *idx, int *arr) { -; for (int i = 0; i < *idx; i++) { -; arr[i] = 0; -; } -; } -; -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %idx, ptr %arr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp21 = load i32, ptr %idx, align 4 - %cmp2 = icmp sgt i32 %tmp21, 0 - br i1 %cmp2, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv - store i32 0, ptr %arrayidx, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %tmp2 = load i32, ptr %idx, align 4 - %0 = sext i32 %tmp2 to i64 - %cmp = icmp slt i64 %indvars.iv.next, %0 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-hoisting.ll b/polly/test/GPGPU/invariant-load-hoisting.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-hoisting.ll +++ /dev/null @@ -1,116 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; -; RUN: opt %loadPolly -polly-scops -S -polly-invariant-load-hoisting \ -; RUN: -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR -; -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=KERNEL-IR -; -; REQUIRES: pollyacc -; -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end26 -; SCOP-NEXT: Max Loop Depth: 3 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [n, tmp12] -> { Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] }; -; SCOP-NEXT: Execution Context: [n, tmp12] -> { : n > 0 } -; SCOP-NEXT: } -; HOST-IR: call void @polly_launchKernel(ptr %[[REGC:[0-9]+]], i32 %{{[0-9]+}}, i32 1, i32 32, i32 1, i32 1, ptr %polly_launch_0_params_i8ptr) -; HOST-IR-NEXT: call void @polly_freeKernel(ptr %[[REGC]]) - -; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_B, ptr addrspace(1) %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge) - - -; Check that we generate correct GPU code in case of invariant load hoisting. 
-; -; -; static const int N = 3000; -; -; void f(int A[N][N], int *invariant, int B[N][N], int n) { -; for (int i = 0; i < n; i++) { -; for (int j = 0; j < n; j++) { -; for (int k = 0; k < n; k++) { -; -; A[*invariant][k] = B[k][k]; -; A[k][*invariant] += B[k][k]; -; } -; } -; } -; } -; - -define void @f(ptr %A, ptr %invariant, ptr %B, i32 %n) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp6 = icmp sgt i32 %n, 0 - br i1 %cmp6, label %for.cond1.preheader.lr.ph, label %for.end26 - -for.cond1.preheader.lr.ph: ; preds = %entry.split - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc24 - %i.07 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc25, %for.inc24 ] - %cmp23 = icmp sgt i32 %n, 0 - br i1 %cmp23, label %for.cond4.preheader.lr.ph, label %for.inc24 - -for.cond4.preheader.lr.ph: ; preds = %for.cond1.preheader - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %for.cond4.preheader.lr.ph, %for.inc21 - %j.04 = phi i32 [ 0, %for.cond4.preheader.lr.ph ], [ %inc22, %for.inc21 ] - %cmp51 = icmp sgt i32 %n, 0 - br i1 %cmp51, label %for.body6.lr.ph, label %for.inc21 - -for.body6.lr.ph: ; preds = %for.cond4.preheader - br label %for.body6 - -for.body6: ; preds = %for.body6.lr.ph, %for.body6 - %k.02 = phi i32 [ 0, %for.body6.lr.ph ], [ %inc, %for.body6 ] - %idxprom = sext i32 %k.02 to i64 - %idxprom7 = sext i32 %k.02 to i64 - %arrayidx8 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom, i64 %idxprom7 - %tmp9 = load i32, ptr %arrayidx8, align 4 - %tmp12 = load i32, ptr %invariant, align 4 - %idxprom9 = sext i32 %tmp12 to i64 - %idxprom11 = sext i32 %k.02 to i64 - %arrayidx12 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom9, i64 %idxprom11 - store i32 %tmp9, ptr %arrayidx12, align 4 - %idxprom13 = sext i32 %k.02 to i64 - %idxprom15 = sext i32 %k.02 to i64 - %arrayidx16 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom13, i64 %idxprom15 - %tmp17 = load i32, ptr %arrayidx16, align 4 - %idxprom17 = sext i32 %k.02 to i64 - %tmp21 = load i32, ptr %invariant, align 4 - %idxprom19 = sext i32 %tmp21 to i64 - %arrayidx20 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom17, i64 %idxprom19 - %tmp22 = load i32, ptr %arrayidx20, align 4 - %add = add nsw i32 %tmp22, %tmp17 - store i32 %add, ptr %arrayidx20, align 4 - %inc = add nuw nsw i32 %k.02, 1 - %cmp5 = icmp slt i32 %inc, %n - br i1 %cmp5, label %for.body6, label %for.cond4.for.inc21_crit_edge - -for.cond4.for.inc21_crit_edge: ; preds = %for.body6 - br label %for.inc21 - -for.inc21: ; preds = %for.cond4.for.inc21_crit_edge, %for.cond4.preheader - %inc22 = add nuw nsw i32 %j.04, 1 - %cmp2 = icmp slt i32 %inc22, %n - br i1 %cmp2, label %for.cond4.preheader, label %for.cond1.for.inc24_crit_edge - -for.cond1.for.inc24_crit_edge: ; preds = %for.inc21 - br label %for.inc24 - -for.inc24: ; preds = %for.cond1.for.inc24_crit_edge, %for.cond1.preheader - %inc25 = add nuw nsw i32 %i.07, 1 - %cmp = icmp slt i32 %inc25, %n - br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end26_crit_edge - -for.cond.for.end26_crit_edge: ; preds = %for.inc24 - br label %for.end26 - -for.end26: ; preds = %for.cond.for.end26_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-of-scalar.ll b/polly/test/GPGPU/invariant-load-of-scalar.ll deleted file mode 100644 --- a/polly/test/GPGPU/invariant-load-of-scalar.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops 
-disable-output < %s | FileCheck -check-prefix=SCOP %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=HOST-IR %s - - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; Check that we offload invariant loads of scalars correctly. - -; Check that invariant loads are present. -; SCOP: Function: checkPrivatization -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp2] -> { Stmt_entry_split[] -> MemRef_begin[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp2] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp2] -> { Stmt_for_body[i0] -> MemRef_end[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp2] -> { : } -; SCOP-NEXT: } -; - -; Check that we do not actually allocate arrays for %begin, %end, since they are -; invariant load hoisted. -; HOST-IR: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice -; HOST-IR-NOT: call ptr @polly_allocateMemoryForDevice - -; Check that we send the invariant loaded scalars as parameters to the -; kernel function. -; KERNEL-IR: define ptx_kernel void @FUNC_checkPrivatization_SCOP_0_KERNEL_0 -; KERNEL-IR-SAME: (ptr addrspace(1) %MemRef_A, i32 %tmp, -; KERNEL-IR-SAME: i32 %tmp2, i32 %polly.access.begin.load) - - -; void checkScalarPointerOffload(int A[], int *begin, int *end) { -; for(int i = *begin; i < *end; i++) { -; A[i] = 10; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -define void @checkPrivatization(ptr %A, ptr %begin, ptr %end) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp = load i32, ptr %begin, align 4 - %tmp21 = load i32, ptr %end, align 4 - %cmp3 = icmp slt i32 %tmp, %tmp21 - br i1 %cmp3, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - %tmp1 = sext i32 %tmp to i64 - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv4 = phi i64 [ %tmp1, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv4 - store i32 10, ptr %arrayidx, align 4 - %indvars.iv.next = add i64 %indvars.iv4, 1 - %tmp2 = load i32, ptr %end, align 4 - %tmp3 = sext i32 %tmp2 to i64 - %cmp = icmp slt i64 %indvars.iv.next, %tmp3 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll deleted file mode 100644 --- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll +++ /dev/null @@ -1,106 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=IR %s - -; REQUIRES: pollyacc -; -; void kernel_params_only_some_arrays(float A[], float B[]) { -; for (long i = 0; i < 32; i++) -; A[i] += 42; -; -; for (long i = 0; i < 32; i++) -; B[i] += 
42; -; } - -; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0' -; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0" -; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" - -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B) -; KERNEL-NEXT: entry: -; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-NEXT: %t0 = zext i32 %1 to i64 - -; KERNEL: ret void -; KERNEL-NEXT: } - -; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1' -; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1" -; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" - -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A) -; KERNEL-NEXT: entry: -; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-NEXT: %t0 = zext i32 %1 to i64 - -; KERNEL: ret void -; KERNEL-NEXT: } - - -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 -; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 -; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* -; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] - -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 -; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 -; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* -; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @kernel_params_only_some_arrays(float* %A, float* %B) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %arrayidx = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp = load float, float* %arrayidx, align 4 - %add = fadd float %tmp, 4.200000e+01 - store float %add, float* %arrayidx, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %inc = add nuw nsw i64 %i.0, 1 - br label %for.cond - -for.end: ; preds = %for.cond - br label %for.cond2 - -for.cond2: ; preds = %for.inc7, %for.end - %i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ] - %exitcond = icmp ne i64 %i1.0, 32 - br i1 %exitcond, label %for.body4, label %for.end9 - -for.body4: ; preds = %for.cond2 - %arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0 - %tmp2 = load float, float* %arrayidx5, align 4 - %add6 = fadd float %tmp2, 4.200000e+01 - store float %add6, float* %arrayidx5, align 4 - br label %for.inc7 - -for.inc7: ; preds = %for.body4 - %inc8 = add nuw 
nsw i64 %i1.0, 1 - br label %for.cond2 - -for.end9: ; preds = %for.cond2 - ret void -} diff --git a/polly/test/GPGPU/kernel-params-scop-parameter.ll b/polly/test/GPGPU/kernel-params-scop-parameter.ll deleted file mode 100644 --- a/polly/test/GPGPU/kernel-params-scop-parameter.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; void kernel_params_scop_parameter(float A[], long n) { -; for (long i = 0; i < n; i++) -; A[i] += 42; -; } - -; KERNEL-IR: define ptx_kernel void @FUNC_kernel_params_scop_parameter_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @kernel_params_scop_parameter(ptr %A, i64 %n) { -bb: - br label %bb1 - -bb1: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb8 - -bb2: ; preds = %bb1 - %tmp3 = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp4 = load float, ptr %tmp3, align 4 - %tmp5 = fadd float %tmp4, 4.200000e+01 - store float %tmp5, ptr %tmp3, align 4 - br label %bb6 - -bb6: ; preds = %bb2 - %tmp7 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb8: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll b/polly/test/GPGPU/kernels-names-across-scops-funcs.ll deleted file mode 100644 --- a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll +++ /dev/null @@ -1,124 +0,0 @@ -; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-kernel-ir -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { -; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_1_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { -; KERNEL: define ptx_kernel void @FUNC_foo2_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; Function Attrs: nounwind uwtable -define void @foo(i32 %arg, ptr %arg1) #0 { -bb: - br label %bb2 - -bb2: ; preds = %bb - %tmp = icmp sgt i32 %arg, 0 - br i1 %tmp, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb4, %bb3 - %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ] - %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp5 - %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2 - %tmp8 = add nsw i32 %tmp7, 1 - store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2 - %tmp9 = add nuw nsw i64 %tmp5, 1 - %tmp10 = zext i32 %arg to i64 - %tmp11 = icmp ne i64 %tmp9, %tmp10 - br i1 %tmp11, label %bb4, label %bb12 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12, %bb2 - %tmp14 = tail call i64 @clock() #3 - %tmp15 = icmp eq i64 %tmp14, 0 - br i1 %tmp15, label %bb16, label %bb29 - -bb16: ; preds = %bb13 - %tmp17 = icmp sgt i32 %arg, 0 - br i1 %tmp17, label %bb18, label %bb28 - -bb18: ; preds = %bb16 - br label %bb19 - -bb19: ; preds = %bb19, %bb18 - %tmp20 = phi i64 [ 0, %bb18 ], [ %tmp24, %bb19 ] - %tmp21 = getelementptr inbounds i32, ptr %arg1, i64 %tmp20 - %tmp22 = load i32, ptr %tmp21, align 4, !tbaa !2 - %tmp23 = add nsw i32 %tmp22, 1 - store i32 %tmp23, ptr %tmp21, align 4, !tbaa !2 - %tmp24 = add nuw nsw i64 %tmp20, 1 - %tmp25 = zext i32 %arg to i64 - %tmp26 = icmp ne i64 %tmp24, %tmp25 - br i1 %tmp26, label %bb19, label %bb27 - -bb27: 
; preds = %bb19 - br label %bb28 - -bb28: ; preds = %bb27, %bb16 - br label %bb29 - -bb29: ; preds = %bb28, %bb13 - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1 - -; Function Attrs: nounwind -declare i64 @clock() #2 - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1 - -; Function Attrs: nounwind uwtable -define void @foo2(i32 %arg, ptr %arg1) #0 { -bb: - br label %bb2 - -bb2: ; preds = %bb - %tmp = icmp sgt i32 %arg, 0 - br i1 %tmp, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb4, %bb3 - %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ] - %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp5 - %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2 - %tmp8 = add nsw i32 %tmp7, 1 - store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2 - %tmp9 = add nuw nsw i64 %tmp5, 1 - %tmp10 = zext i32 %arg to i64 - %tmp11 = icmp ne i64 %tmp9, %tmp10 - br i1 %tmp11, label %bb4, label %bb12 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12, %bb2 - ret void -} - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } - -!llvm.module.flags = !{!0} -!llvm.ident = !{!1} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{!"clang version 5.0.0"} -!2 = !{!3, !3, i64 0} -!3 = !{!"int", !4, i64 0} -!4 = !{!"omnipotent char", !5, i64 0} -!5 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll b/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll +++ /dev/null @@ -1,89 +0,0 @@ -; RUN: opt %loadPolly -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s \ -; RUN: -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll \ -; RUN: | FileCheck %s --check-prefix=HOST-IR - -; Test that we do recognise and codegen a kernel that has functions that can -; be mapped to NVIDIA's libdevice - -; REQUIRES: pollyacc - -; Check that we model the kernel as a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end - -; Check that the intrinsic call is present in the kernel IR. 
-; KERNEL-IR: %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_) -; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf) -; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf) - -; Powi and exp cannot be lowered directly. Rather, we expect them to be -; lowered by libdevice. -; KERNEL-IR: %p_powi = tail call float @__nv_powif(float %p_logf, i32 2) -; KERNEL-IR: %p_exp = tail call float @__nv_expf(float %p_powi) - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - - -; void f(float *A, float *B, int N) { -; for(int i = 0; i < N; i++) { -; float tmp0 = A[i]; -; float expf = expf(tmp1); -; cosf = cosf(expf); -; logf = logf(cosf); -; powi = powi(logf, 2); -; exp = exp(powi); -; B[i] = logf; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %A, ptr %B, i32 %N) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp1 = icmp sgt i32 %N, 0 - br i1 %cmp1, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %A.arr.i = getelementptr inbounds float, ptr %A, i64 %indvars.iv - %A.arr.i.val = load float, ptr %A.arr.i, align 4 - ; Call to intrinsics that should be part of the kernel. - %expf = tail call float @expf(float %A.arr.i.val) - %cosf = tail call float @cosf(float %expf) - %logf = tail call float @logf(float %cosf) - %powi = tail call float @llvm.powi.f32.i32(float %logf, i32 2) - %exp = tail call float @llvm.exp.f32(float %powi) - %B.arr.i = getelementptr inbounds float, ptr %B, i64 %indvars.iv - store float %exp, ptr %B.arr.i, align 4 - - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %wide.trip.count = zext i32 %N to i64 - %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - -; Function Attrs: nounwind readnone -declare float @expf(float) #0 -declare float @cosf(float) #0 -declare float @logf(float) #0 -declare float @llvm.powi.f32.i32(float, i32) #0 -declare float @llvm.exp.f32(float) #0 - -attributes #0 = { nounwind readnone } - diff --git a/polly/test/GPGPU/live-range-reordering-with-privatization.ll b/polly/test/GPGPU/live-range-reordering-with-privatization.ll deleted file mode 100644 --- a/polly/test/GPGPU/live-range-reordering-with-privatization.ll +++ /dev/null @@ -1,78 +0,0 @@ - ; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ -; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-code -disable-output \ -; RUN: < %s | FileCheck %s -check-prefix=CODE - -; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ -; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-kernel-ir -disable-output \ -; RUN: < %s | FileCheck %s -check-prefix=KERNELIR - -; REQUIRES: pollyacc - -; void f(const int *end, int *arr, const int *control, const int *readarr) { -; for (int i = 0; i < *end; i++) { -; int t = 0; -; if (*control > 3) { -; t += readarr[i]; -; } -; arr[i] = t; -; } -; } - -; This test case tests the ability to infer that `t` is local to each loop -; iteration, and 
can therefore be privatized. - -; CODE: # kernel0 -; CODE-NEXT: for (int c0 = 0; c0 <= (tmp - 32 * b0 - 1) / 1048576; c0 += 1) -; CODE-NEXT: if (tmp >= 32 * b0 + t0 + 1048576 * c0 + 1) { -; CODE-NEXT: Stmt_for_body_last(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: if (tmp1 >= 4) -; CODE-NEXT: Stmt_if_then(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: Stmt_if_end(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: } - -; KERNELIR: %private_array = alloca i32 - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" -target triple = "i386-apple-macosx10.12.0" - -define void @f(ptr %end, ptr %arr, ptr %control, ptr %readarr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp3 = load i32, ptr %end, align 4 - %cmp4 = icmp sgt i32 %tmp3, 0 - br i1 %cmp4, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %if.end - %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ] - %tmp1 = load i32, ptr %control, align 4 - %cmp1 = icmp sgt i32 %tmp1, 3 - br i1 %cmp1, label %if.then, label %if.end - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, ptr %readarr, i32 %i.05 - %tmp2 = load i32, ptr %arrayidx, align 4 - br label %if.end - -if.end: ; preds = %if.then, %for.body - %t.0 = phi i32 [ %tmp2, %if.then ], [ 0, %for.body ] - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i32 %i.05 - store i32 %t.0, ptr %arrayidx2, align 4 - %inc = add nuw nsw i32 %i.05, 1 - %tmp = load i32, ptr %end, align 4 - %cmp = icmp slt i32 %inc, %tmp - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %if.end - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - diff --git a/polly/test/GPGPU/loops-outside-scop.ll b/polly/test/GPGPU/loops-outside-scop.ll deleted file mode 100644 --- a/polly/test/GPGPU/loops-outside-scop.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - -; There is no FileCheck because we want to make sure that this doesn't crash. -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \ -; RUN: -disable-output < %s - -; REQUIRES: pollyacc - -; Due to the existence of the `fence` call, We can only detect the inner loop -; and not the outer loop. PPCGCodeGeneration had not implemented this case. -; The fix was to pull the implementation from `IslNodeBuilder. 
- -; Make sure that we only capture the inner loop -; SCOP: Function: f -; SCOP-NEXT: Region: %for2.body---%for2.body.fence -; SCOP-NEXT: Max Loop Depth: 1 - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -declare void @fn_to_fence(ptr %val) - -; void f(int *arr, bool shouldcont) { -; for(int i = 0; ; i++) { -; for(int j = 0; j < 10; j++) { -; arr[j] = i; -; } -; fence(arr); -; if (!shouldcont) break; -; } -; } - - -; Function Attrs: nounwind uwtable -define void @f(ptr %arr, i1 %shouldcont) #1 { -entry: - br label %for.init - -for.init: ; preds = %for.end, %entry.split - %i = phi i32 [ %i.next, %for.end ], [ 0, %entry ] - br label %for2.body - -for2.body: ; preds = %"65", %"64" - %j = phi i32 [ %j.next, %for2.body ], [ 0, %for.init ] - %j.sext = sext i32 %j to i64 - %arr.slot = getelementptr i32, ptr %arr, i64 %j.sext - store i32 %i, ptr %arr.slot, align 4 - %exitcond = icmp eq i32 %j, 10 - %j.next = add i32 %j, 1 - br i1 %exitcond, label %for2.body.fence, label %for2.body - -for2.body.fence: ; preds = %"65" - call void @fn_to_fence(ptr %arr) #2 - br i1 %shouldcont, label %for.end, label %exit -for.end: ; preds = %"69" - %i.next = add i32 %i, 1 - br label %for.init - -exit: ; preds = %"69" - ret void - -} - - -attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind uwtable } -attributes #2 = { nounwind } diff --git a/polly/test/GPGPU/managed-memory-rewrite-alloca.ll b/polly/test/GPGPU/managed-memory-rewrite-alloca.ll deleted file mode 100644 --- a/polly/test/GPGPU/managed-memory-rewrite-alloca.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ -; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory -polly-acc-rewrite-allocas < %s | FileCheck %s --check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP: i32 MemRef_arr[*]; - -; Check that we generate a constructor call for @A.toptr -; HOST-IR-NOT: %arr = alloca [100 x i32] - -source_filename = "test.c" -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - - -define void @f() { -entry: - %arr = alloca [100 x i32] - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds [100 x i32], ptr %arr, i64 0, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} 
-!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll deleted file mode 100644 --- a/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll +++ /dev/null @@ -1,93 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -S -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR -; -; REQUIRES: pollyacc -; -; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and -; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory` -; pass, even inside `constantExpr`. This is necessary because a cookie cutter -; Inst->replaceUsesOfWith(...) call does not actually work, because this does -; not replace the instruction within a ConstantExpr. -; -; #include -; -; static const int N = 100; -; int* f(int *ToFree) { -; free(ToFree); -; int *A = (int *)malloc(sizeof(int) * N); -; for(int i = 0; i < N; i++) { -; A[i] = 42; -; } -; return A; -; -; } - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; SCOP: Arrays { -; SCOP-NEXT: i32 MemRef_tmp[*]; // Element size 4 -; SCOP-NEXT: } - -; // Check that polly_mallocManaged is declared and used correctly. -; HOST-IR: declare ptr @polly_mallocManaged(i64) - -; // Check that polly_freeManaged is declared and used correctly. -; HOST-IR call void @polly_freeManaged(i8* %toFree) -; HOST-IR: declare void @polly_freeManaged(ptr) - -; // Check that we remove the original malloc,free -; HOST-IR-NOT: declare ptr @malloc(i64) -; HOST-IR-NOT: declare void @free(ptr) - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -define ptr @f(ptr %toFree) { -entry: - ; Free inside bitcast - call void @free (ptr %toFree) - br label %entry.split - -entry.split: ; preds = %entry - ; malloc inside bitcast. 
- %tmp = call ptr @malloc (i64 400) - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %tmp, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret ptr %tmp -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - -declare ptr @malloc(i64) -declare void @free(ptr) - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll deleted file mode 100644 --- a/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -S -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR -; -; REQUIRES: pollyacc -; -; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and -; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory` -; pass. -; -; #include <stdlib.h> -; -; static const int N = 100; -; int* f(int *ToFree) { -; free(ToFree); -; int *A = (int *)malloc(sizeof(int) * N); -; for(int i = 0; i < N; i++) { -; A[i] = 42; -; } -; return A; -; -; } - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; SCOP: Arrays { -; SCOP-NEXT: i32 MemRef_call[*]; // Element size 4 -; SCOP-NEXT: } - -; // Check that polly_mallocManaged is declared and used correctly. -; HOST-IR: %call = tail call ptr @polly_mallocManaged(i64 400) -; HOST-IR: declare ptr @polly_mallocManaged(i64) - -; // Check that polly_freeManaged is declared and used correctly.
-; HOST-IR %toFreeBitcast = bitcast i32* %toFree to i8* -; HOST-IR call void @polly_freeManaged(i8* %toFreeBitcast) -; HOST-IR: declare void @polly_freeManaged(ptr) - -; // Check that we remove the original malloc,free -; HOST-IR-NOT: declare ptr @malloc(i64) -; HOST-IR-NOT: declare void @free(ptr) - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -define ptr @f(ptr %toFree) { -entry: - call void @free(ptr %toFree) - br label %entry.split - -entry.split: ; preds = %entry - %call = tail call ptr @malloc(i64 400) - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %call, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret ptr %call -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - -declare ptr @malloc(i64) -declare void @free(ptr) - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/memory-only-referenced-from-access.ll b/polly/test/GPGPU/memory-only-referenced-from-access.ll deleted file mode 100644 --- a/polly/test/GPGPU/memory-only-referenced-from-access.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -polly-invariant-load-hoisting -polly-ignore-aliasing \ -; RUN: -polly-process-unprofitable -polly-ignore-parameter-bounds \ -; RUN: -polly-acc-fail-on-verify-module-failure \ -; RUN: -polly-acc-codegen-managed-memory \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc - -; Verify that we correctly generate a kernel even if certain invariant load -; hoisted parameters appear only in memory accesses, but not domain elements. 
- -; CHECK: @FUNC_quux_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_tmp4, i32 %tmp3, i32 %tmp, i32 %tmp31, i32 %tmp2) - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -%struct.hoge = type { ptr, i64, i64, [1 x %struct.widget] } -%struct.widget = type { i64, i64, i64 } - -@global = external unnamed_addr global %struct.hoge, align 32 - -define void @quux(ptr noalias %arg, ptr noalias %arg1) { -bb: - %tmp = load i32, ptr %arg, align 4 - %tmp2 = sext i32 %tmp to i64 - %tmp3 = load i32, ptr %arg1, align 4 - %tmp4 = load ptr, ptr @global, align 32 - br label %bb5 - -bb5: ; preds = %bb5, %bb - %tmp6 = phi i32 [ %tmp11, %bb5 ], [ 0, %bb ] - %tmp7 = sext i32 %tmp6 to i64 - %tmp8 = sub nsw i64 %tmp7, %tmp2 - %tmp9 = getelementptr [0 x double], ptr %tmp4, i64 0, i64 %tmp8 - store double undef, ptr %tmp9, align 8 - %tmp10 = icmp eq i32 %tmp6, %tmp3 - %tmp11 = add i32 %tmp6, 1 - br i1 %tmp10, label %bb12, label %bb5 - -bb12: ; preds = %bb5 - ret void -} diff --git a/polly/test/GPGPU/mostly-sequential.ll b/polly/test/GPGPU/mostly-sequential.ll deleted file mode 100644 --- a/polly/test/GPGPU/mostly-sequential.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; void foo(float A[]) { -; for (long i = 0; i < 128; i++) -; A[i] += i; -; -; for (long i = 0; i < 128; i++) -; for (long j = 0; j < 128; j++) -; A[42] += i + j; -; } - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(4); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock; -; CODE-NEXT: dim3 k1_dimGrid; -; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (128) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb4(32 * b0 + t0); - -; CODE: # kernel1 -; CODE-NEXT: for (int c0 = 0; c0 <= 127; c0 += 1) -; CODE-NEXT: for (int c1 = 0; c1 <= 127; c1 += 1) -; CODE-NEXT: Stmt_bb14(c0, c1); - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A) { -bb: - br label %bb3 - -bb3: ; preds = %bb8, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ] - %exitcond2 = icmp ne i64 %i.0, 128 - br i1 %exitcond2, label %bb4, label %bb10 - -bb4: ; preds = %bb3 - %tmp = sitofp i64 %i.0 to float - %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp6 = load float, ptr %tmp5, align 4 - %tmp7 = fadd float %tmp6, %tmp - store float %tmp7, ptr %tmp5, align 4 - br label %bb8 - -bb8: ; preds = %bb4 - %tmp9 = add nuw nsw i64 %i.0, 1 - br label %bb3 - -bb10: ; preds = %bb3 - br label %bb11 - -bb11: ; preds = %bb23, %bb10 - %i1.0 = phi i64 [ 0, %bb10 ], [ %tmp24, %bb23 ] - %exitcond1 = icmp ne i64 %i1.0, 128 - br i1 %exitcond1, label %bb12, label %bb25 - -bb12: ; preds = %bb11 - br label %bb13 - -bb13: ; preds = %bb20, %bb12 - %j.0 = phi i64 [ 0, %bb12 ], [ %tmp21, %bb20 ] - %exitcond = icmp ne i64 %j.0, 128 - br i1 %exitcond, label %bb14, label %bb22 - -bb14: ; preds = %bb13 - %tmp15 =
add nuw nsw i64 %i1.0, %j.0 - %tmp16 = sitofp i64 %tmp15 to float - %tmp17 = getelementptr inbounds float, ptr %A, i64 42 - %tmp18 = load float, ptr %tmp17, align 4 - %tmp19 = fadd float %tmp18, %tmp16 - store float %tmp19, ptr %tmp17, align 4 - br label %bb20 - -bb20: ; preds = %bb14 - %tmp21 = add nuw nsw i64 %j.0, 1 - br label %bb13 - -bb22: ; preds = %bb13 - br label %bb23 - -bb23: ; preds = %bb22 - %tmp24 = add nuw nsw i64 %i1.0, 1 - br label %bb11 - -bb25: ; preds = %bb11 - ret void -} diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll deleted file mode 100644 --- a/polly/test/GPGPU/non-read-only-scalars.ll +++ /dev/null @@ -1,168 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR -; -; REQUIRES: pollyacc -; -; #include <stdio.h> -; -; float foo(float A[]) { -; float sum = 0; -; -; for (long i = 0; i < 32; i++) -; A[i] = i; -; -; for (long i = 0; i < 32; i++) -; A[i] += i; -; -; for (long i = 0; i < 32; i++) -; sum += A[i]; -; -; return sum; -; } -; -; int main() { -; float A[32]; -; float sum = foo(A); -; printf("%f\n", sum); -; } - -; CODE: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock; -; CODE-NEXT: dim3 k1_dimGrid; -; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_sum_0__phi); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k2_dimBlock; -; CODE-NEXT: dim3 k2_dimGrid; -; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0__phi)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: { -; CODE-NEXT: Stmt_bb4(t0); -; CODE-NEXT: Stmt_bb10(t0); -; CODE-NEXT: } - -; CODE: # kernel1 -; CODE-NEXT: Stmt_bb17(); - -; CODE: # kernel2 -; TODO-NEXT: { -; TODO-NEXT: read(); -; TODO-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) { -; TODO-NEXT: Stmt_bb18(c0); -; TODO-NEXT: if (c0 <= 31) -; TODO-NEXT: Stmt_bb20(c0); -; TODO-NEXT: } -; TODO-NEXT: write(); -; TODO-NEXT: } - - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(ptr addrspace(1) %MemRef_sum_0__phi) -; KERNEL-IR: store float 0.000000e+00, ptr %sum.0.phiops -; KERNEL-IR: [[REGA:%.+]] = addrspacecast ptr addrspace(1) %MemRef_sum_0__phi to ptr -; KERNEL-IR: [[REGB:%.+]] = load float, ptr %sum.0.phiops -; KERNEL-IR: store float [[REGB]], ptr [[REGA]] - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2(ptr addrspace(1) %MemRef_A, ptr addrspace(1) %MemRef_sum_0__phi, ptr addrspace(1) %MemRef_sum_0) - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1 - -define float @foo(ptr %A) { -bb: - br label %bb3 - -bb3: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %exitcond2 = icmp ne i64 %i.0, 32 - br i1 %exitcond2, label
%bb4, label %bb8 - -bb4: ; preds = %bb3 - %tmp = sitofp i64 %i.0 to float - %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0 - store float %tmp, ptr %tmp5, align 4 - br label %bb6 - -bb6: ; preds = %bb4 - %tmp7 = add nuw nsw i64 %i.0, 1 - br label %bb3 - -bb8: ; preds = %bb3 - br label %bb9 - -bb9: ; preds = %bb15, %bb8 - %i1.0 = phi i64 [ 0, %bb8 ], [ %tmp16, %bb15 ] - %exitcond1 = icmp ne i64 %i1.0, 32 - br i1 %exitcond1, label %bb10, label %bb17 - -bb10: ; preds = %bb9 - %tmp11 = sitofp i64 %i1.0 to float - %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0 - %tmp13 = load float, ptr %tmp12, align 4 - %tmp14 = fadd float %tmp13, %tmp11 - store float %tmp14, ptr %tmp12, align 4 - br label %bb15 - -bb15: ; preds = %bb10 - %tmp16 = add nuw nsw i64 %i1.0, 1 - br label %bb9 - -bb17: ; preds = %bb9 - br label %bb18 - -bb18: ; preds = %bb20, %bb17 - %sum.0 = phi float [ 0.000000e+00, %bb17 ], [ %tmp23, %bb20 ] - %i2.0 = phi i64 [ 0, %bb17 ], [ %tmp24, %bb20 ] - %exitcond = icmp ne i64 %i2.0, 32 - br i1 %exitcond, label %bb19, label %bb25 - -bb19: ; preds = %bb18 - br label %bb20 - -bb20: ; preds = %bb19 - %tmp21 = getelementptr inbounds float, ptr %A, i64 %i2.0 - %tmp22 = load float, ptr %tmp21, align 4 - %tmp23 = fadd float %sum.0, %tmp22 - %tmp24 = add nuw nsw i64 %i2.0, 1 - br label %bb18 - -bb25: ; preds = %bb18 - %sum.0.lcssa = phi float [ %sum.0, %bb18 ] - ret float %sum.0.lcssa -} - -define i32 @main() { -bb: - %A = alloca [32 x float], align 16 - %tmp1 = call float @foo(ptr %A) - %tmp2 = fpext float %tmp1 to double - %tmp3 = call i32 (ptr, ...) @printf(ptr @.str, double %tmp2) #2 - ret i32 0 -} - -declare i32 @printf(ptr, ...) #1 - diff --git a/polly/test/GPGPU/non-zero-array-offset.ll b/polly/test/GPGPU/non-zero-array-offset.ll deleted file mode 100644 --- a/polly/test/GPGPU/non-zero-array-offset.ll +++ /dev/null @@ -1,116 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR -; -; REQUIRES: pollyacc - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice)); - -; CODE: dim3 k0_dimBlock(8); -; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock(8); -; CODE-NEXT: dim3 k1_dimGrid(1); -; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost)); - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb3(t0); - -; CODE: # kernel1 -; CODE-NEXT: Stmt_bb11(t0); - -; IR: %p_dev_array_MemRef_B = call ptr @polly_allocateMemoryForDevice(i64 32) -; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 32) -; IR-NEXT: [[REG0:%.+]] = getelementptr float, ptr %B, i64 8 -; IR-NEXT: call void @polly_copyFromHostToDevice(ptr [[REG0]], ptr %p_dev_array_MemRef_B, i64 32) - -; IR: [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_B) -; IR-NEXT: [[REGC:%.+]] = getelementptr float, ptr [[REGA]], i64 -8 - -; void foo(float A[], float B[]) { -; for (long i = 0; i < 8;
i++) -; B[i + 8] *= 4; -; -; for (long i = 0; i < 8; i++) -; A[i] *= 12; -; } -; -; #ifdef OUTPUT -; int main() { -; float A[16]; -; -; for (long i = 0; i < 16; i++) { -; __sync_synchronize(); -; A[i] = i; -; } -; -; foo(A, A); -; -; float sum = 0; -; for (long i = 0; i < 16; i++) { -; __sync_synchronize(); -; sum += A[i]; -; } -; -; printf("%f\n", sum); -; } -; #endif -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb2 - -bb2: ; preds = %bb7, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ] - %exitcond1 = icmp ne i64 %i.0, 8 - br i1 %exitcond1, label %bb3, label %bb9 - -bb3: ; preds = %bb2 - %tmp = add nuw nsw i64 %i.0, 8 - %tmp4 = getelementptr inbounds float, ptr %B, i64 %tmp - %tmp5 = load float, ptr %tmp4, align 4 - %tmp6 = fmul float %tmp5, 4.000000e+00 - store float %tmp6, ptr %tmp4, align 4 - br label %bb7 - -bb7: ; preds = %bb3 - %tmp8 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb9: ; preds = %bb2 - br label %bb10 - -bb10: ; preds = %bb15, %bb9 - %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ] - %exitcond = icmp ne i64 %i1.0, 8 - br i1 %exitcond, label %bb11, label %bb17 - -bb11: ; preds = %bb10 - %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0 - %tmp13 = load float, ptr %tmp12, align 4 - %tmp14 = fmul float %tmp13, 1.200000e+01 - store float %tmp14, ptr %tmp12, align 4 - br label %bb15 - -bb15: ; preds = %bb11 - %tmp16 = add nuw nsw i64 %i1.0, 1 - br label %bb10 - -bb17: ; preds = %bb10 - ret void -} diff --git a/polly/test/GPGPU/only-part-of-array-modified.ll b/polly/test/GPGPU/only-part-of-array-modified.ll deleted file mode 100644 --- a/polly/test/GPGPU/only-part-of-array-modified.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s -; -; REQUIRES: pollyacc -; -; void foo(float A[], float B[]) { -; for (long i = 0; i < 1024; i++) -; A[2 * i] = B[i]; -; } - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i32), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2047) * sizeof(i32), cudaMemcpyHostToDevice)); - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb1 - -bb1: ; preds = %bb8, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb10 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds float, ptr %B, i64 %i.0 - %tmp4 = load i32, ptr %tmp, align 4 - %tmp5 = shl nsw i64 %i.0, 1 - %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5 - store i32 %tmp4, ptr %tmp6, align 4 - br label %bb8 - -bb8: ; preds = %bb2 - %tmp9 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb10: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/parametric-loop-bound.ll b/polly/test/GPGPU/parametric-loop-bound.ll deleted file mode 100644 --- a/polly/test/GPGPU/parametric-loop-bound.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=IR %s - -; REQUIRES: pollyacc - -; void foo(long A[], long n) { -; for (long i = 0; i < n; i++) -; A[i] += 100; -; } - -; CODE: if (n >= 1) { -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (n) * sizeof(i64), cudaMemcpyHostToDevice)); 
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(n >= 1048545 ? 32768 : (n + 31) / 32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, n);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (n) * sizeof(i64), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: for (int c0 = 0; c0 <= (n - 32 * b0 - 1) / 1048576; c0 += 1)
-; CODE-NEXT: if (n >= 32 * b0 + t0 + 1048576 * c0 + 1)
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0);
-
-; IR: store i64 %n, ptr %polly_launch_0_param_1
-; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1
-; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGA]]
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, i64 %n) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb6, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
- %tmp = icmp slt i64 %i.0, %n
- br i1 %tmp, label %bb2, label %bb8
-
-bb2: ; preds = %bb1
- %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0
- %tmp4 = load i64, ptr %tmp3, align 8
- %tmp5 = add nsw i64 %tmp4, 100
- store i64 %tmp5, ptr %tmp3, align 8
- br label %bb6
-
-bb6: ; preds = %bb2
- %tmp7 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb8: ; preds = %bb1
- ret void
-}
diff --git a/polly/test/GPGPU/partial_writes.ll b/polly/test/GPGPU/partial_writes.ll
deleted file mode 100644
--- a/polly/test/GPGPU/partial_writes.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-codegen-ppcg -polly-stmt-granularity=bb -S < %s \
-; RUN: | FileCheck %s
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; CHECK: polly_launchKernel
-
-; Function Attrs: nounwind uwtable
-define void @partial_writes() {
-bb:
- %tmp = tail call ptr @wibble() #2
- br label %bb2
-
-bb2: ; preds = %bb11, %bb
- %tmp3 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ]
- %tmp4 = getelementptr inbounds [1200 x double], ptr %tmp, i64 0, i64 %tmp3
- %tmp5 = load double, ptr %tmp4, align 8, !tbaa !1
- br label %bb6
-
-bb6: ; preds = %bb6, %bb2
- %tmp7 = phi double [ undef, %bb2 ], [ undef, %bb6 ]
- %tmp8 = phi i64 [ 0, %bb2 ], [ %tmp9, %bb6 ]
- store double undef, ptr %tmp4, align 8, !tbaa !1
- %tmp9 = add nuw nsw i64 %tmp8, 1
- %tmp10 = icmp eq i64 %tmp9, 900
- br i1 %tmp10, label %bb11, label %bb6
-
-bb11: ; preds = %bb6
- %tmp12 = add nuw nsw i64 %tmp3, 1
- %tmp13 = icmp eq i64 %tmp12, 1200
- br i1 %tmp13, label %bb14, label %bb2
-
-bb14: ; preds = %bb11
- ret void
-}
-
-declare ptr @wibble()
-
-
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version 6.0.0 (trunk 309912) (llvm/trunk 309933)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"double", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop b/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop
deleted file mode 100644
--- a/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop
+++ /dev/null
@@ -1,47 +0,0 @@
-{
- "arrays" : [
- {
- "name" : "MemRef_tmp",
- "sizes" : [ "*" ],
- "type" : "double"
- }
- ],
- "context" : "{ : }",
- "name" : "%bb2---%bb14",
- "statements" : [
- {
- "accesses" : [
- {
- "kind" : "read",
- "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }"
- },
- {
- "kind" : "write",
- "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }"
- }
- ],
- "domain" : "{ Stmt_bb2[i0] : 0 <= i0 <= 1199 }",
- "name" :
"Stmt_bb2", - "schedule" : "{ Stmt_bb2[i0] -> [i0, 0, 0] }" - }, - { - "accesses" : [ - { - "kind" : "write", - "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] : i1 <= 898 }" - }, - { - "kind" : "read", - "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }" - }, - { - "kind" : "write", - "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }" - } - ], - "domain" : "{ Stmt_bb6[i0, i1] : 0 <= i0 <= 1199 and 0 <= i1 <= 899 }", - "name" : "Stmt_bb6", - "schedule" : "{ Stmt_bb6[i0, i1] -> [i0, 1, i1] }" - } - ] -} diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/phi-nodes-in-kernel.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; REQUIRES: pollyacc - -; Approximate C source: -; void kernel_dynprog(int c[50]) { -; int iter = 0; -; int outl = 0; -; -; while(1) { -; for(int indvar = 1 ; indvar <= 49; indvar++) { -; c[indvar] = undef; -; } -; add78 = c[49] + outl; -; inc80 = iter + 1; -; -; if (true) break; -; -; outl = add78; -; iter = inc80; -; } -;} -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; CODE: cudaCheckReturn(cudaMalloc((void **) &dev_MemRef_c, (50) * sizeof(i32))); - -; CODE: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(2); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_c); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_c)); - -; CODE: # kernel0 -; CODE-NEXT: if (32 * b0 + t0 <= 48) -; CODE-NEXT: Stmt_for_body17(0, 32 * b0 + t0); - -; IR-LABEL: call void @polly_freeKernel -; IR: [[REGC:%.+]] = bitcast i32* %{{[0-9]+}} to i8* -; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196) - -; KERNEL-IR: define ptx_kernel void @FUNC_kernel_dynprog_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_c) #0 { -; KERNEL-IR: %polly.access.MemRef_c = getelementptr i32, i32 addrspace(1)* %polly.access.cast.MemRef_c, i64 %9 -; KERNEL-IR-NEXT: store i32 422, i32 addrspace(1)* %polly.access.MemRef_c, align 4 - -define void @kernel_dynprog([50 x i32]* %c) { -entry: - %arrayidx77 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 49 - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.cond15.for.cond12.loopexit_crit_edge, %entry - %out_l.055 = phi i32 [ 0, %entry ], [ %add78, %for.cond15.for.cond12.loopexit_crit_edge ] - %iter.054 = phi i32 [ 0, %entry ], [ %inc80, %for.cond15.for.cond12.loopexit_crit_edge ] - br label %for.body17 - -for.cond15.for.cond12.loopexit_crit_edge: ; preds = %for.body17 - %tmp = load i32, i32* %arrayidx77, align 4 - %add78 = add nsw i32 %tmp, %out_l.055 - %inc80 = add nuw nsw i32 %iter.054, 1 - br i1 false, label %for.cond1.preheader, label %for.end81 - -for.body17: ; preds = %for.body17, %for.cond1.preheader - %indvars.iv71 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next72, %for.body17 ] - %arrayidx69 = getelementptr inbounds [50 x i32], [50 x i32]* %c, 
i64 0, i64 %indvars.iv71 - store i32 422, i32* %arrayidx69, align 4 - %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1 - %lftr.wideiv74 = trunc i64 %indvars.iv.next72 to i32 - %exitcond75 = icmp ne i32 %lftr.wideiv74, 50 - br i1 %exitcond75, label %for.body17, label %for.cond15.for.cond12.loopexit_crit_edge - -for.end81: ; preds = %for.cond15.for.cond12.loopexit_crit_edge - ret void -} diff --git a/polly/test/GPGPU/private-memory.ll b/polly/test/GPGPU/private-memory.ll deleted file mode 100644 --- a/polly/test/GPGPU/private-memory.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-private \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-acc-use-private \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; void add(float *A) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 10; j++) -; A[i] += 1; -; } - -; CODE: # kernel0 -; CODE: { -; CODE: read(t0); -; CODE: for (int c3 = 0; c3 <= 9; c3 += 1) -; CODE: Stmt_bb5(t0, c3); -; CODE: write(t0); -; CODE: } - -; KERNEL: %private_array = alloca [1 x float] - -; KERNEL: %polly.access.cast.private_array = bitcast [1 x float]* %private_array to float* -; KERNEL-NEXT: %polly.access.private_array = getelementptr float, float* %polly.access.cast.private_array, i64 0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0 -; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_A -; KERNEL-NEXT: store float %shared.read, float* %polly.access.private_array - -; KERNEL: %polly.access.cast.private_array5 = bitcast [1 x float]* %private_array to float* -; KERNEL-NEXT: %polly.access.private_array6 = getelementptr float, float* %polly.access.cast.private_array5, i64 0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A7 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A8 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A7, i64 %t0 -; KERNEL-NEXT: %shared.write = load float, float* %polly.access.private_array6 -; KERNEL-NEXT: store float %shared.write, float addrspace(1)* %polly.access.MemRef_A8 - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @add(float* %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb11, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb8, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %j.0, 10 - br i1 %exitcond, label %bb5, label %bb10 - -bb5: ; preds = %bb4 - %tmp = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp6 = load float, float* %tmp, align 4 - %tmp7 = fadd float %tmp6, 1.000000e+00 - store float %tmp7, float* %tmp, align 4 - br label %bb8 - -bb8: ; preds = %bb5 - %tmp9 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb10: ; preds = %bb4 - br label %bb11 - -bb11: ; preds = %bb10 - %tmp12 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb13: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/privatization-simple.ll b/polly/test/GPGPU/privatization-simple.ll deleted file mode 100644 --- 
a/polly/test/GPGPU/privatization-simple.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; void f(int A[], int B[], int control, int C[]) { -; int x; -; #pragma scop -; for(int i = 0; i < 1000; i ++) { -; x = 0; -; if(control) x = C[i]; -; B[i] = x * A[i]; -; -; } -; #pragma endscop -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %A, ptr %B, i32 %control, ptr %C) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %if.end - %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] - %tobool = icmp eq i32 %control, 0 - br i1 %tobool, label %if.end, label %if.then - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv - %tmp4 = load i32, ptr %arrayidx, align 4 - br label %if.end - -if.end: ; preds = %for.body, %if.then - %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] - %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv - %tmp8 = load i32, ptr %arrayidx2, align 4 - %mul = mul nsw i32 %tmp8, %x.0 - %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv - store i32 %mul, ptr %arrayidx4, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %for.body, label %for.end - -for.end: ; preds = %if.end - ret void -} diff --git a/polly/test/GPGPU/privatization.ll b/polly/test/GPGPU/privatization.ll deleted file mode 100644 --- a/polly/test/GPGPU/privatization.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: checkPrivatization -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. 
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-;
-;
-; void checkPrivatization(int A[], int B[], int C[], int control) {
-; int x;
-; #pragma scop
-; for (int i = 0; i < 1000; i++) {
-; x = 0;
-; if (control)
-; x += C[i];
-;
-; B[i] = x * A[i];
-; }
-; #pragma endscop
-; }
-;
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @checkPrivatization(ptr %A, ptr %B, ptr %C, i32 %control) {
-entry:
- br label %entry.split
-
-entry.split: ; preds = %entry
- br label %for.body
-
-for.body: ; preds = %entry.split, %if.end
- %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
- %tobool = icmp eq i32 %control, 0
- br i1 %tobool, label %if.end, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
- %tmp4 = load i32, ptr %arrayidx, align 4
- br label %if.end
-
-if.end: ; preds = %for.body, %if.then
- %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
- %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
- %tmp9 = load i32, ptr %arrayidx2, align 4
- %mul = mul nsw i32 %tmp9, %x.0
- %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
- store i32 %mul, ptr %arrayidx4, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp ne i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.body, label %for.end
-
-for.end: ; preds = %if.end
- ret void
-}
diff --git a/polly/test/GPGPU/region-stmt.ll b/polly/test/GPGPU/region-stmt.ll
deleted file mode 100644
--- a/polly/test/GPGPU/region-stmt.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (128) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(4);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_B);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (128) * sizeof(float), cudaMemcpyDeviceToHost));
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_for_body__TO__if_end(32 * b0 + t0);
-
-; IR: @polly_initContext
-
-; KERNEL-IR: kernel_0
-
-; REQUIRES: pollyacc
-
-; void foo(float A[], float B[]) {
-; for (long i = 0; i < 128; i++)
-; if (A[i] == 42)
-; B[i] += 2 * i;
-; else
-; B[i] += 4 * i;
-; }
-;
-source_filename = "/tmp/test.c"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.inc, %entry
- %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
- %exitcond = icmp ne i64 %i.0, 128
- br i1 %exitcond, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- %arrayidx = getelementptr inbounds float, ptr %A, i64 %i.0
- %tmp = load float, ptr %arrayidx, align 4
- %cmp1 = fcmp oeq float %tmp, 4.200000e+01
- br i1 %cmp1, label %if.then, label %if.else
-
-if.then: ; preds = %for.body
- %mul = shl nsw i64 %i.0, 1
- %conv = sitofp i64 %mul to float
- %arrayidx2 = getelementptr inbounds float, ptr %B, i64 %i.0
- %tmp1 = load float, ptr %arrayidx2, align 4
- %add = fadd float %tmp1, %conv
- store float %add, ptr %arrayidx2, align 4
- br label %if.end
-
-if.else:
; preds = %for.body - %mul3 = shl nsw i64 %i.0, 2 - %conv4 = sitofp i64 %mul3 to float - %arrayidx5 = getelementptr inbounds float, ptr %B, i64 %i.0 - %tmp2 = load float, ptr %arrayidx5, align 4 - %add6 = fadd float %tmp2, %conv4 - store float %add6, ptr %arrayidx5, align 4 - br label %if.end - -if.end: ; preds = %if.else, %if.then - br label %for.inc - -for.inc: ; preds = %if.end - %inc = add nuw nsw i64 %i.0, 1 - br label %for.cond - -for.end: ; preds = %for.cond - ret void -} diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll b/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll deleted file mode 100644 --- a/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4 -; KERNEL-IR-NEXT: br label %polly.merge - -define void @kernel_dynprog(ptr %sum_c) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry - br label %for.body3 - -for.cond1.loopexit: ; preds = %for.end - %indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1 - %exitcond57 = icmp ne i64 %indvars.iv.next56, 49 - br i1 %exitcond57, label %for.body3, label %for.inc55 - -for.body3: ; preds = %for.cond1.loopexit, %for.cond1.preheader - %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ] - %indvars.iv48 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next49, %for.cond1.loopexit ] - %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1 - %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv48, i64 %indvars.iv55 - store i32 0, ptr %arrayidx10, align 4 - %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv48 - br label %for.end - -for.end: ; preds = %for.body3 - br label %for.cond1.loopexit - -for.inc55: ; preds = %for.cond1.loopexit - ret void -} diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll b/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll deleted file mode 100644 --- a/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; REQUIRES: pollyacc - -; Ensure that no dead instructions are emitted between the store and the -; branch instruction of the ScopStmt. At some point, our dead-code-elimination -; did not remove code that was inserted to compute the old (unused) branch -; condition. This code referred to CPU registers and consequently resulted -; in invalid bitcode. 
- -; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4 -; KERNEL-IR-NEXT: br label %polly.merge - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @kernel_dynprog(ptr %sum_c) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry - br label %for.body3 - -for.cond4.for.cond1.loopexit_crit_edge: ; preds = %for.end - br label %for.cond1.loopexit - -for.cond1.loopexit: ; preds = %for.cond4.for.cond1.loopexit_crit_edge - br i1 undef, label %for.body3, label %for.inc55 - -for.body3: ; preds = %for.cond1.loopexit, %for.cond1.preheader - %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ] - %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1 - br label %for.body6 - -for.body6: ; preds = %for.end, %for.body3 - %indvars.iv50 = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next51, %for.end ] - %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 %indvars.iv55 - store i32 0, ptr %arrayidx10, align 4 - %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv50 - br i1 %cmp1334, label %for.body14.lr.ph, label %for.end - -for.body14.lr.ph: ; preds = %for.body6 - br label %for.body14 - -for.body14: ; preds = %for.body14, %for.body14.lr.ph - %arrayidx32 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 0 - br i1 false, label %for.body14, label %for.cond12.for.end_crit_edge - -for.cond12.for.end_crit_edge: ; preds = %for.body14 - br label %for.end - -for.end: ; preds = %for.cond12.for.end_crit_edge, %for.body6 - %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1 - %lftr.wideiv53 = trunc i64 %indvars.iv.next51 to i32 - %exitcond54 = icmp ne i32 %lftr.wideiv53, 50 - br i1 %exitcond54, label %for.body6, label %for.cond4.for.cond1.loopexit_crit_edge - -for.inc55: ; preds = %for.cond1.loopexit - unreachable -} diff --git a/polly/test/GPGPU/run-time-check.ll b/polly/test/GPGPU/run-time-check.ll deleted file mode 100644 --- a/polly/test/GPGPU/run-time-check.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR -; -; REQUIRES: pollyacc -; -; void foo(long n, float A[][32]) { -; for (long i = 0; i < n; i++) -; for (long j = 0; j < n; j++) -; A[i][j] += A[i + 1][j + 1]; -; } - -; IR: %tmp = icmp slt i64 %i.0, %n -; IR-NEXT: br i1 %tmp, label %bb2, label %polly.merge_new_and_old - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(i64 %n, ptr %A) { -bb: - br label %bb1 - -bb1: ; preds = %bb15, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp16, %bb15 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb17 - -bb2: ; preds = %bb1 - br label %bb3 - -bb3: ; preds = %bb12, %bb2 - %j.0 = phi i64 [ 0, %bb2 ], [ %tmp13, %bb12 ] - %exitcond = icmp ne i64 %j.0, %n - br i1 %exitcond, label %bb4, label %bb14 - -bb4: ; preds = %bb3 - %tmp5 = add nuw nsw i64 %j.0, 1 - %tmp6 = add nuw nsw i64 %i.0, 1 - %tmp7 = getelementptr inbounds [32 x float], ptr %A, i64 %tmp6, i64 %tmp5 - %tmp8 = load float, ptr %tmp7, align 4 - %tmp9 = getelementptr inbounds [32 x float], ptr %A, i64 %i.0, i64 %j.0 - %tmp10 = load float, ptr %tmp9, align 4 - %tmp11 = fadd float %tmp10, %tmp8 - store float %tmp11, ptr %tmp9, align 4 - br label %bb12 - -bb12: ; preds = %bb4 - %tmp13 = add nuw nsw i64 %j.0, 1 - br label %bb3 - -bb14: ; preds = %bb3 - br label %bb15 - -bb15: ; preds = 
%bb14
- %tmp16 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb17: ; preds = %bb1
- ret void
-}
diff --git a/polly/test/GPGPU/scalar-param-and-value-32-bit.ll b/polly/test/GPGPU/scalar-param-and-value-32-bit.ll
deleted file mode 100644
--- a/polly/test/GPGPU/scalar-param-and-value-32-bit.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-;
-; void foo(float A[], int n) {
-; for (long j = 0; j < n; j++)
-; A[j + n] += 42;
-; }
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i32 %n)
-
-define void @foo(ptr %A, i32 %n) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb9, %bb
- %j.0 = phi i64 [ 0, %bb ], [ %tmp10, %bb9 ]
- %tmp = sext i32 %n to i64
- %tmp2 = icmp slt i64 %j.0, %tmp
- br i1 %tmp2, label %bb3, label %bb11
-
-bb3: ; preds = %bb1
- %tmp4 = sext i32 %n to i64
- %tmp5 = add nsw i64 %j.0, %tmp4
- %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5
- %tmp7 = load float, ptr %tmp6, align 4
- %tmp8 = fadd float %tmp7, 4.200000e+01
- store float %tmp8, ptr %tmp6, align 4
- br label %bb9
-
-bb9: ; preds = %bb3
- %tmp10 = add nuw nsw i64 %j.0, 1
- br label %bb1
-
-bb11: ; preds = %bb1
- ret void
-}
diff --git a/polly/test/GPGPU/scalar-param-and-value-use.ll b/polly/test/GPGPU/scalar-param-and-value-use.ll
deleted file mode 100644
--- a/polly/test/GPGPU/scalar-param-and-value-use.ll
+++ /dev/null
@@ -1,67 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=IR %s
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; void foo(long n, float A[][n]) {
-; for (long i = 0; i < 32; i++)
-; for (long j = 0; j < 32; j++)
-; A[i][j] += A[i + 1][j + 1];
-; }
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; This test case failed at some point as %n was only available in this kernel
-; when referenced through an isl_id in an isl ast expression, but not when
-; it was referenced from a SCEV or instruction that is not part of any loop
-; bound.
- -; IR: %polly.access.mul.MemRef_A = mul nsw i64 {{.*}}, %n - -define void @foo(i64 %n, ptr %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb19, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp20, %bb19 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb21 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb16, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp17, %bb16 ] - %exitcond = icmp ne i64 %j.0, 32 - br i1 %exitcond, label %bb5, label %bb18 - -bb5: ; preds = %bb4 - %tmp = add nuw nsw i64 %j.0, 1 - %tmp6 = add nuw nsw i64 %i.0, 1 - %tmp7 = mul nsw i64 %tmp6, %n - %tmp8 = getelementptr inbounds float, ptr %A, i64 %tmp7 - %tmp9 = getelementptr inbounds float, ptr %tmp8, i64 %tmp - %tmp10 = load float, ptr %tmp9, align 4 - %tmp11 = mul nsw i64 %i.0, %n - %tmp12 = getelementptr inbounds float, ptr %A, i64 %tmp11 - %tmp13 = getelementptr inbounds float, ptr %tmp12, i64 %j.0 - %tmp14 = load float, ptr %tmp13, align 4 - %tmp15 = fadd float %tmp14, %tmp10 - store float %tmp15, ptr %tmp13, align 4 - br label %bb16 - -bb16: ; preds = %bb5 - %tmp17 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb18: ; preds = %bb4 - br label %bb19 - -bb19: ; preds = %bb18 - %tmp20 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb21: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-fp128.ll b/polly/test/GPGPU/scalar-parameter-fp128.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-fp128.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "LowerFormalArguments didn't emit the correct number of values!" - -; void foo(fp128 A[], fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @fp128(ptr %A, fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0 - %tmp3 = load fp128, ptr %tmp, align 4 - %tmp4 = fadd fp128 %tmp3, %b - store fp128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-half.ll b/polly/test/GPGPU/scalar-parameter-half.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-half.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; void foo(half A[], half b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @half(ptr %A, half %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds half, ptr %A, i64 %i.0 - %tmp3 = load half, ptr %tmp, align 4 - %tmp4 = fadd half %tmp3, %b - store half %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-i120.ll b/polly/test/GPGPU/scalar-parameter-i120.ll deleted file mode 100644 --- 
a/polly/test/GPGPU/scalar-parameter-i120.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" - -; void foo(i120 A[], i120 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i120(ptr %A, i120 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i120 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i120 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i120, ptr %A, i120 %i.0 - %tmp3 = load i120, ptr %tmp, align 4 - %tmp4 = add i120 %tmp3, %b - store i120 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i120 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-i128.ll b/polly/test/GPGPU/scalar-parameter-i128.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-i128.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; void foo(i128 A[], i128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i128(ptr %A, i128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i128 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i128 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i128, ptr %A, i128 %i.0 - %tmp3 = load i128, ptr %tmp, align 4 - %tmp4 = add i128 %tmp3, %b - store i128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i128 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-i3000.ll b/polly/test/GPGPU/scalar-parameter-i3000.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-i3000.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" - -; void foo(i3000 A[], i3000 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i3000(ptr %A, i3000 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i3000 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i3000 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i3000, ptr %A, i3000 %i.0 - %tmp3 = load i3000, ptr %tmp, align 4 - %tmp4 = add i3000 %tmp3, %b - store i3000 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i3000 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-i80.ll b/polly/test/GPGPU/scalar-parameter-i80.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-i80.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "Promotion is not suitable for scalars of size 
larger than 64-bits" - -; void foo(i80 A[], i80 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i80(ptr %A, i80 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i80 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i80 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i80, ptr %A, i80 %i.0 - %tmp3 = load i80, ptr %tmp, align 4 - %tmp4 = add i80 %tmp3, %b - store i80 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i80 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "LowerFormalArguments didn't emit the correct number of values!" - -; void foo(fp128 A[], fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @ppc_fp128(ptr %A, ppc_fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds ppc_fp128, ptr %A, i64 %i.0 - %tmp3 = load ppc_fp128, ptr %tmp, align 4 - %tmp4 = fadd ppc_fp128 %tmp3, %b - store ppc_fp128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "LowerFormalArguments didn't emit the correct number of values!" 
-
-; void foo(fp128 A[], fp128 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @fp128(ptr %A, fp128 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0
- %tmp3 = load fp128, ptr %tmp, align 4
- %tmp4 = fadd fp128 %tmp3, %b
- store fp128 %tmp4, ptr %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
diff --git a/polly/test/GPGPU/scalar-parameter.ll b/polly/test/GPGPU/scalar-parameter.ll
deleted file mode 100644
--- a/polly/test/GPGPU/scalar-parameter.ll
+++ /dev/null
@@ -1,411 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -S < %s | \
-; RUN: FileCheck -check-prefix=IR %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today due to extensive output differences from when the test was written.
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, float %MemRef_b)
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(float A[], float b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-define void @float(ptr %A, float %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds float, ptr %A, i64 %i.0
- %tmp3 = load float, ptr %tmp, align 4
- %tmp4 = fadd float %tmp3, %b
- store float %tmp4, ptr %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, double %MemRef_b)
-; KERNEL-NEXT: entry:
-; KERNEL-NEXT: %b.s2a = alloca double
-; KERNEL-NEXT: store double %MemRef_b, ptr %b.s2a
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(double A[], double b) {
-; for (long i = 0;
i < 1024; i++)
-; A[i] += b;
-; }
-;
-define void @double(ptr %A, double %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds double, ptr %A, i64 %i.0
- %tmp3 = load double, ptr %tmp, align 4
- %tmp4 = fadd double %tmp3, %b
- store double %tmp4, ptr %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i1), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i1), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(i1 A[], i1 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-define void @i1(ptr %A, i1 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds i1, ptr %A, i64 %i.0
- %tmp3 = load i1, ptr %tmp, align 4
- %tmp4 = add i1 %tmp3, %b
- store i1 %tmp4, ptr %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i3), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i3), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(i3 A[], i3 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-define void @i3(ptr %A, i3 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds i3, ptr %A, i64 %i.0
- %tmp3 = load i3, ptr %tmp, align 4
- %tmp4 = add i3 %tmp3, %b
- store i3 %tmp4, ptr %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i8), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i8), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(i8 A[], i32 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-define void @i8(ptr %A, i8 %b) {
-bb:
- br label %bb1
-
-bb1:
; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds i8, ptr %A, i64 %i.0
- %tmp3 = load i8, ptr %tmp, align 4
- %tmp4 = add i8 %tmp3, %b
- store i8 %tmp4, ptr %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; IR-LABEL: @i8
-
-; IR: [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A)
-; IR-NEXT: store ptr [[REGA:%.+]], ptr %polly_launch_0_param_0
-; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params
-; IR-NEXT: store i8 %b, ptr %polly_launch_0_param_1
-; IR-NEXT: [[REGD:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1
-; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGD]]
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i32), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(i32 A[], i32 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-define void @i32(ptr %A, i32 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds i32, ptr %A, i64 %i.0
- %tmp3 = load i32, ptr %tmp, align 4
- %tmp4 = add i32 %tmp3, %b
- store i32 %tmp4, ptr %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i60), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i60), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(i60 A[], i60 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-define void @i60(ptr %A, i60 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds i60, ptr %A, i64 %i.0
- %tmp3 = load i60, ptr %tmp, align 4
- %tmp4 = add i60 %tmp3, %b
- store i60 %tmp4, ptr %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-;
CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i64 A[], i64 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i64(ptr %A, i64 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp3 = load i64, ptr %tmp, align 4 - %tmp4 = add i64 %tmp3, %b - store i64 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll b/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll deleted file mode 100644 --- a/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: opt %loadPolly -polly-acc-dump-code -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-code -polly-stmt-granularity=bb \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODE - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting -polly-stmt-granularity=bb < %s \ -; RUN: | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: { Stmt_loop_a[i0] -> MemRef_p[0] }; -; SCOP-NEXT: Execution Context: { : } -; SCOP-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: { -; CODE-NEXT: if (32 * b0 + t0 <= 1025) { -; CODE-NEXT: Stmt_loop(32 * b0 + t0); -; CODE-NEXT: write(0); -; CODE-NEXT: } -; CODE-NEXT: sync0(); -; CODE-NEXT: } - -; Check that we generate a correct "always false" branch. -; HOST-IR: br i1 false, label %polly.start, label %loop.pre_entry_bb - -; This test case checks that we generate correct code if PPCGCodeGeneration -; decides a build is unsuccessful with invariant load hoisting enabled. -; -; There is a conditional branch which switches between the original code and -; the new code. We try to set this conditional branch to branch on false. -; However, invariant load hoisting changes the structure of the scop, so we -; need to change the way we *locate* this instruction. 
-
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.12.0"
-
-define void @foo(ptr %A, ptr %p) {
-entry:
- br label %loop
-
-loop:
- %indvar = phi i64 [0, %entry], [%indvar.next, %loop]
- %indvar.next = add i64 %indvar, 1
- %invariant = load float, ptr %p
- %ptr = getelementptr float, ptr %A, i64 %indvar
- store float 42.0, ptr %ptr
- %cmp = icmp sle i64 %indvar, 1024
- br i1 %cmp, label %loop, label %loop2
-
-loop2:
- %indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2]
- %indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2]
- %indvar2.next = add i64 %indvar2, 1
- store float %indvar2f, ptr %A
- %cmp2 = icmp sle i64 %indvar2, 1024
- br i1 %cmp2, label %loop2, label %end
-
-end:
- ret void
-}
diff --git a/polly/test/GPGPU/scheduler-timeout.ll b/polly/test/GPGPU/scheduler-timeout.ll
deleted file mode 100644
--- a/polly/test/GPGPU/scheduler-timeout.ll
+++ /dev/null
@@ -1,174 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; This test case took at some point forever to schedule, as the isl scheduler
-; seems to have problems if domain constraints appear in the dependences
-; provided to the scheduler.
-
-; /* D := alpha*A*B*C + beta*D */
-; for (i = 0; i < _PB_NI; i++)
-; for (j = 0; j < _PB_NJ; j++)
-; {
-; tmp[i][j] = 0;
-; for (k = 0; k < _PB_NK; ++k)
-; tmp[i][j] += alpha * A[i][k] * B[k][j];
-; }
-; for (i = 0; i < _PB_NI; i++)
-; for (j = 0; j < _PB_NL; j++)
-; {
-; D[i][j] *= beta;
-; for (k = 0; k < _PB_NJ; ++k)
-; D[i][j] += tmp[i][k] * C[k][j];
-; }
-
-; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(16, 32);
-; CODE-NEXT: dim3 k0_dimGrid(128, 128);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_A, MemRef_alpha, dev_MemRef_B);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: {
-; CODE-NEXT: dim3 k1_dimBlock(16, 32);
-; CODE-NEXT: dim3 k1_dimGrid(128, 128);
-; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_D, MemRef_beta, dev_MemRef_C);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_tmp, dev_MemRef_tmp, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_D, dev_MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost));
-
-; CODE: # kernel0
-; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1)
-; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) {
-; CODE-NEXT: if (c2 == 0)
-; CODE-NEXT: Stmt_for_body6(32 * b0 + t0, 32 * b1 + t1 + 16 * c4);
-; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1)
-; CODE-NEXT: Stmt_for_body11(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5);
-; CODE-NEXT: }
-
-; CODE: # kernel1
-; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1)
-; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) {
-; CODE-NEXT: if (c2 == 0)
-; CODE-NEXT: Stmt_for_body36(32 * b0 + t0, 32 * b1
+ t1 + 16 * c4); -; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1) -; CODE-NEXT: Stmt_for_body44(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5); -; CODE-NEXT: } - - - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start(i64, ptr nocapture) #0 - -; Function Attrs: nounwind uwtable -define internal void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, float %alpha, float %beta, ptr %tmp, ptr %A, ptr %B, ptr %C, ptr %D) #1 { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry.split, %for.inc28 - %indvars.iv19 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next20, %for.inc28 ] - br label %for.body6 - -for.cond31.preheader: ; preds = %for.inc28 - br label %for.cond34.preheader - -for.body6: ; preds = %for.cond4.preheader, %for.inc25 - %indvars.iv16 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next17, %for.inc25 ] - %arrayidx8 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16 - store float 0.000000e+00, ptr %arrayidx8, align 4, !tbaa !1 - br label %for.body11 - -for.body11: ; preds = %for.body6, %for.body11 - %indvars.iv13 = phi i64 [ 0, %for.body6 ], [ %indvars.iv.next14, %for.body11 ] - %arrayidx15 = getelementptr inbounds [4096 x float], ptr %A, i64 %indvars.iv19, i64 %indvars.iv13 - %tmp22 = load float, ptr %arrayidx15, align 4, !tbaa !1 - %mul = fmul float %tmp22, %alpha - %arrayidx19 = getelementptr inbounds [4096 x float], ptr %B, i64 %indvars.iv13, i64 %indvars.iv16 - %tmp23 = load float, ptr %arrayidx19, align 4, !tbaa !1 - %mul20 = fmul float %mul, %tmp23 - %arrayidx24 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16 - %tmp24 = load float, ptr %arrayidx24, align 4, !tbaa !1 - %add = fadd float %tmp24, %mul20 - store float %add, ptr %arrayidx24, align 4, !tbaa !1 - %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 - %exitcond15 = icmp ne i64 %indvars.iv.next14, 4096 - br i1 %exitcond15, label %for.body11, label %for.inc25 - -for.inc25: ; preds = %for.body11 - %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 - %exitcond18 = icmp ne i64 %indvars.iv.next17, 4096 - br i1 %exitcond18, label %for.body6, label %for.inc28 - -for.inc28: ; preds = %for.inc25 - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond21 = icmp ne i64 %indvars.iv.next20, 4096 - br i1 %exitcond21, label %for.cond4.preheader, label %for.cond31.preheader - -for.cond34.preheader: ; preds = %for.cond31.preheader, %for.inc65 - %indvars.iv10 = phi i64 [ 0, %for.cond31.preheader ], [ %indvars.iv.next11, %for.inc65 ] - br label %for.body36 - -for.body36: ; preds = %for.cond34.preheader, %for.inc62 - %indvars.iv7 = phi i64 [ 0, %for.cond34.preheader ], [ %indvars.iv.next8, %for.inc62 ] - %arrayidx40 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7 - %tmp25 = load float, ptr %arrayidx40, align 4, !tbaa !1 - %mul41 = fmul float %tmp25, %beta - store float %mul41, ptr %arrayidx40, align 4, !tbaa !1 - br label %for.body44 - -for.body44: ; preds = %for.body36, %for.body44 - %indvars.iv = phi i64 [ 0, %for.body36 ], [ %indvars.iv.next, %for.body44 ] - %arrayidx48 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv10, i64 %indvars.iv - %tmp26 = load float, ptr %arrayidx48, align 4, !tbaa !1 - %arrayidx52 = getelementptr inbounds [4096 x float], ptr %C, i64 %indvars.iv, i64 %indvars.iv7 - %tmp27 = load float, ptr %arrayidx52, align 4, !tbaa !1 - %mul53 = fmul float %tmp26, 
%tmp27
-  %arrayidx57 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7
-  %tmp28 = load float, ptr %arrayidx57, align 4, !tbaa !1
-  %add58 = fadd float %tmp28, %mul53
-  store float %add58, ptr %arrayidx57, align 4, !tbaa !1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 4096
-  br i1 %exitcond, label %for.body44, label %for.inc62
-
-for.inc62: ; preds = %for.body44
-  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
-  %exitcond9 = icmp ne i64 %indvars.iv.next8, 4096
-  br i1 %exitcond9, label %for.body36, label %for.inc65
-
-for.inc65: ; preds = %for.inc62
-  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
-  %exitcond12 = icmp ne i64 %indvars.iv.next11, 4096
-  br i1 %exitcond12, label %for.cond34.preheader, label %for.end67
-
-for.end67: ; preds = %for.inc65
-  ret void
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, ptr nocapture) #0
-
-attributes #0 = { argmemonly nounwind }
-attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"float", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/shared-memory-scalar.ll b/polly/test/GPGPU/shared-memory-scalar.ll
deleted file mode 100644
--- a/polly/test/GPGPU/shared-memory-scalar.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -polly-acc-use-shared \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; REQUIRES: pollyacc
-
-; void add(float *A, float alpha) {
-;   for (long i = 0; i < 32; i++)
-;     for (long j = 0; j < 10; j++)
-;       A[i] += alpha;
-; }
-
-; CODE: read(t0);
-; CODE-NEXT: sync0();
-; CODE-NEXT: for (int c3 = 0; c3 <= 9; c3 += 1)
-; CODE-NEXT:   Stmt_bb5(t0, c3);
-; CODE-NEXT: sync1();
-; CODE-NEXT: write(t0);
-
-; This test case was intended to test code generation for scalars stored
-; in shared memory. However, after properly marking the scalar as read-only
-; the scalar is no longer stored in shared memory. We still leave this
-; test case as documentation in case we ever forget to mark scalars as read-only.
- -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @add(ptr %A, float %alpha) { -bb: - br label %bb2 - -bb2: ; preds = %bb11, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb8, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %j.0, 10 - br i1 %exitcond, label %bb5, label %bb10 - -bb5: ; preds = %bb4 - %tmp = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp6 = load float, ptr %tmp, align 4 - %tmp7 = fadd float %tmp6, %alpha - store float %tmp7, ptr %tmp, align 4 - br label %bb8 - -bb8: ; preds = %bb5 - %tmp9 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb10: ; preds = %bb4 - br label %bb11 - -bb11: ; preds = %bb10 - %tmp12 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb13: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/shared-memory-two-dimensional.ll b/polly/test/GPGPU/shared-memory-two-dimensional.ll deleted file mode 100644 --- a/polly/test/GPGPU/shared-memory-two-dimensional.ll +++ /dev/null @@ -1,103 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; void foo(float A[], float b[][8]) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 16; j++) -; for (long k = 0; k < 8; k++) -; A[i] += j * k * b[j][k]; -; } - - -; CODE: # kernel0 -; CODE-NEXT: { -; CODE-NEXT: if (t0 <= 7) -; CODE-NEXT: for (int c0 = 0; c0 <= 15; c0 += 1) -; CODE-NEXT: read(c0, t0); -; CODE-NEXT: read(t0); -; CODE-NEXT: sync0(); -; CODE-NEXT: for (int c3 = 0; c3 <= 15; c3 += 1) -; CODE-NEXT: for (int c4 = 0; c4 <= 7; c4 += 1) -; CODE-NEXT: Stmt_bb8(t0, c3, c4); -; CODE-NEXT: sync1(); -; CODE-NEXT: write(t0); -; CODE-NEXT: } - -; KERNEL: @shared_MemRef_b = internal addrspace(3) global [16 x [8 x float]] zeroinitializer, align 4 - -; KERNEL: %polly.access.mul.MemRef_b = mul nsw i64 %polly.indvar, 8 -; KERNEL-NEXT: %polly.access.add.MemRef_b = add nsw i64 %polly.access.mul.MemRef_b, %t0 -; KERNEL-NEXT: %polly.access.MemRef_b = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_b, i64 %polly.access.add.MemRef_b -; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_b -; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_b - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(float* %A, [8 x float]* %b) { -bb: - br label %bb3 - -bb3: ; preds = %bb22, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp23, %bb22 ] - %exitcond2 = icmp ne i64 %i.0, 32 - br i1 %exitcond2, label %bb4, label %bb24 - -bb4: ; preds = %bb3 - br label %bb5 - -bb5: ; preds = %bb19, %bb4 - %j.0 = phi i64 [ 0, %bb4 ], [ %tmp20, %bb19 ] - %exitcond1 = icmp ne i64 %j.0, 16 - br i1 %exitcond1, label %bb6, label %bb21 - -bb6: ; preds = %bb5 - br label %bb7 - -bb7: ; preds = %bb16, %bb6 - %k.0 = phi i64 [ 0, %bb6 ], [ %tmp17, %bb16 ] - %exitcond = icmp ne i64 %k.0, 8 - br i1 %exitcond, label %bb8, label %bb18 - -bb8: ; preds = %bb7 - %tmp = mul nuw nsw i64 %j.0, %k.0 - %tmp9 = sitofp i64 %tmp to float - %tmp10 = getelementptr inbounds [8 x float], [8 x float]* %b, i64 %j.0, 
i64 %k.0 - %tmp11 = load float, float* %tmp10, align 4 - %tmp12 = fmul float %tmp9, %tmp11 - %tmp13 = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp14 = load float, float* %tmp13, align 4 - %tmp15 = fadd float %tmp14, %tmp12 - store float %tmp15, float* %tmp13, align 4 - br label %bb16 - -bb16: ; preds = %bb8 - %tmp17 = add nuw nsw i64 %k.0, 1 - br label %bb7 - -bb18: ; preds = %bb7 - br label %bb19 - -bb19: ; preds = %bb18 - %tmp20 = add nuw nsw i64 %j.0, 1 - br label %bb5 - -bb21: ; preds = %bb5 - br label %bb22 - -bb22: ; preds = %bb21 - %tmp23 = add nuw nsw i64 %i.0, 1 - br label %bb3 - -bb24: ; preds = %bb3 - ret void -} diff --git a/polly/test/GPGPU/shared-memory.ll b/polly/test/GPGPU/shared-memory.ll deleted file mode 100644 --- a/polly/test/GPGPU/shared-memory.ll +++ /dev/null @@ -1,83 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; void add(float *A) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 10; j++) -; A[i] += 1; -; } - -; CODE: # kernel0 -; CODE: { -; CODE: read(t0); -; CODE: sync0(); -; CODE: for (int c3 = 0; c3 <= 9; c3 += 1) -; CODE: Stmt_bb5(t0, c3); -; CODE: sync1(); -; CODE: write(t0); -; CODE: } - -; KERNEL: @shared_MemRef_A = internal addrspace(3) global [32 x float] zeroinitializer, align 4 - -; KERNEL: %polly.access.shared_MemRef_A = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0 -; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_A -; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_A - -; KERNEL: %polly.access.shared_MemRef_A3 = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A4 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A4, i64 %t0 -; KERNEL-NEXT: %shared.write = load float, float addrspace(3)* %polly.access.shared_MemRef_A3 -; KERNEL-NEXT: store float %shared.write, float addrspace(1)* %polly.access.MemRef_A5 - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @add(float* %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb11, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb8, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %j.0, 10 - br i1 %exitcond, label %bb5, label %bb10 - -bb5: ; preds = %bb4 - %tmp = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp6 = load float, float* %tmp, align 4 - %tmp7 = fadd float %tmp6, 1.000000e+00 - store float %tmp7, float* %tmp, align 4 - br label %bb8 - -bb8: ; preds = %bb5 - %tmp9 = 
add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb10: ; preds = %bb4 - br label %bb11 - -bb11: ; preds = %bb10 - %tmp12 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb13: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/simple-managed-memory-rewrite.ll b/polly/test/GPGPU/simple-managed-memory-rewrite.ll deleted file mode 100644 --- a/polly/test/GPGPU/simple-managed-memory-rewrite.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ -; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP: i32 MemRef_A[*]; - -; Check that we generate a constructor call for @A.toptr -; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr {{.*}}, ptr @A.toptr }] - -; Check that we generate a constructor -; 4 bytes * 100 = 400 -; HOST-IR: define void {{.*}}constructor() { -; HOST-IR-NEXT: entry: -; HOST-IR-NEXT: %mem.raw = call ptr @polly_mallocManaged(i64 400) -; HOST-IR-NEXT: store ptr %mem.raw, ptr @A.toptr -; HOST-IR-NEXT: ret void -; HOST-IR-NEXT: } - -; HOST-IR-NOT: @A - -source_filename = "test.c" -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -@A = internal global [100 x i32] zeroinitializer, align 16 - -define void @f() { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds [100 x i32], ptr @A, i64 0, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/size-cast.ll b/polly/test/GPGPU/size-cast.ll deleted file mode 100644 --- a/polly/test/GPGPU/size-cast.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; REQUIRES: pollyacc - -; This test case ensures that we properly sign-extend the types we are using. - -; CODE: if (arg >= 1 && arg1 == 0) { -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_arg3, MemRef_arg3, (arg) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(arg >= 1048545 ? 
32768 : (arg + 31) / 32);
-; CODE-NEXT:   kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_arg3, dev_MemRef_arg2, arg, arg1);
-; CODE-NEXT:   cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_arg2, dev_MemRef_arg2, (arg) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT cudaCheckReturn(cudaFree(dev_MemRef_arg3));
-; CODE-NEXT cudaCheckReturn(cudaFree(dev_MemRef_arg2));
-
-; CODE: # kernel0
-; CODE-NEXT: for (int c0 = 0; c0 <= (arg - 32 * b0 - 1) / 1048576; c0 += 1)
-; CODE-NEXT:   if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1)
-; CODE-NEXT:     Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0);
-
-; IR-LABEL: call ptr @polly_initContextCUDA()
-; IR: sext i32 %arg to i64
-; IR-NEXT: mul i64
-; IR-NEXT: @polly_allocateMemoryForDevice
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define void @hoge(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3) {
-bb:
-  br label %bb4
-
-bb4: ; preds = %bb13, %bb
-  br label %bb6
-
-bb5: ; preds = %bb13
-  ret void
-
-bb6: ; preds = %bb6, %bb4
-  %tmp = phi i64 [ 0, %bb4 ], [ %tmp10, %bb6 ]
-  %tmp7 = getelementptr inbounds double, ptr %arg3, i64 %tmp
-  %tmp8 = load double, ptr %tmp7, align 8
-  %tmp9 = getelementptr inbounds [1000 x double], ptr %arg2, i64 0, i64 %tmp
-  store double %tmp8, ptr %tmp9, align 8
-  %tmp10 = add nuw nsw i64 %tmp, 1
-  %tmp11 = zext i32 %arg to i64
-  %tmp12 = icmp ne i64 %tmp10, %tmp11
-  br i1 %tmp12, label %bb6, label %bb13
-
-bb13: ; preds = %bb6
-  %tmp14 = zext i32 %arg1 to i64
-  %tmp15 = icmp ne i64 0, %tmp14
-  br i1 %tmp15, label %bb4, label %bb5
-}
diff --git a/polly/test/GPGPU/spir-codegen.ll b/polly/test/GPGPU/spir-codegen.ll
deleted file mode 100644
--- a/polly/test/GPGPU/spir-codegen.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
-; RUN: -polly-gpu-arch=spir32 \
-; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc
-
-; CHECK: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-; CHECK-NEXT: target triple = "spir-unknown-unknown"
-
-; CHECK-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   %0 = call i32 @__gen_ocl_get_group_id0()
-; CHECK-NEXT:   %__gen_ocl_get_group_id0 = zext i32 %0 to i64
-; CHECK-NEXT:   %1 = call i32 @__gen_ocl_get_group_id1()
-; CHECK-NEXT:   %__gen_ocl_get_group_id1 = zext i32 %1 to i64
-; CHECK-NEXT:   %2 = call i32 @__gen_ocl_get_local_id0()
-; CHECK-NEXT:   %__gen_ocl_get_local_id0 = zext i32 %2 to i64
-; CHECK-NEXT:   %3 = call i32 @__gen_ocl_get_local_id1()
-; CHECK-NEXT:   %__gen_ocl_get_local_id1 = zext i32 %3 to i64
-; CHECK-NEXT:   br label %polly.loop_preheader
-
-; CHECK-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5
-; CHECK-NEXT:   ret void
-
-; CHECK-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader
-; CHECK-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
-; CHECK-NEXT:   %4 = mul nsw i64 32, %__gen_ocl_get_group_id0
-; CHECK-NEXT:   %5 = add nsw i64 %4, %__gen_ocl_get_local_id0
-; CHECK-NEXT:   %6 = mul nsw i64 32, %__gen_ocl_get_group_id1
-; CHECK-NEXT: %7 = add nsw i64 %6, %__gen_ocl_get_local_id1 -; CHECK-NEXT: %8 = mul nsw i64 16, %polly.indvar -; CHECK-NEXT: %9 = add nsw i64 %7, %8 -; CHECK-NEXT: br label %polly.stmt.bb5 - -; CHECK-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header -; CHECK-NEXT: %10 = mul i64 %5, %9 -; CHECK-NEXT: %p_tmp6 = sitofp i64 %10 to float -; CHECK-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; CHECK-NEXT: %11 = mul nsw i64 32, %__gen_ocl_get_group_id0 -; CHECK-NEXT: %12 = add nsw i64 %11, %__gen_ocl_get_local_id0 -; CHECK-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024 -; CHECK-NEXT: %13 = mul nsw i64 32, %__gen_ocl_get_group_id1 -; CHECK-NEXT: %14 = add nsw i64 %13, %__gen_ocl_get_local_id1 -; CHECK-NEXT: %15 = mul nsw i64 16, %polly.indvar -; CHECK-NEXT: %16 = add nsw i64 %14, %15 -; CHECK-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16 -; CHECK-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A -; CHECK-NEXT: %tmp8_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4 -; CHECK-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6 -; CHECK-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; CHECK-NEXT: %17 = mul nsw i64 32, %__gen_ocl_get_group_id0 -; CHECK-NEXT: %18 = add nsw i64 %17, %__gen_ocl_get_local_id0 -; CHECK-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024 -; CHECK-NEXT: %19 = mul nsw i64 32, %__gen_ocl_get_group_id1 -; CHECK-NEXT: %20 = add nsw i64 %19, %__gen_ocl_get_local_id1 -; CHECK-NEXT: %21 = mul nsw i64 16, %polly.indvar -; CHECK-NEXT: %22 = add nsw i64 %20, %21 -; CHECK-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22 -; CHECK-NEXT: %polly.access.MemRef_A4 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3 -; CHECK-NEXT: store float %p_tmp9, float addrspace(1)* %polly.access.MemRef_A4, align 4 -; CHECK-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 -; CHECK-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 1 -; CHECK-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit - -; CHECK-LABEL: polly.loop_preheader: ; preds = %entry -; CHECK-NEXT: br label %polly.loop_header - -; CHECK: attributes #0 = { "polly.skip.fn" } - -; void double_parallel_loop(float A[][1024]) { -; for (long i = 0; i < 1024; i++) -; for (long j = 0; j < 1024; j++) -; A[i][j] += i * j; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @double_parallel_loop([1024 x float]* %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb13, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] - %exitcond1 = icmp ne i64 %i.0, 1024 - br i1 %exitcond1, label %bb3, label %bb15 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb10, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %j.0, 1024 - br i1 %exitcond, label %bb5, label %bb12 - -bb5: ; preds = %bb4 - %tmp = mul nuw nsw i64 %i.0, %j.0 - %tmp6 = sitofp i64 %tmp to float - %tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0 - %tmp8 = load float, float* %tmp7, align 4 - %tmp9 = fadd float %tmp8, %tmp6 - store float %tmp9, float* %tmp7, align 4 - br label %bb10 - -bb10: ; preds = %bb5 - %tmp11 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12 - 
%tmp14 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb15: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/spir-typesize.ll b/polly/test/GPGPU/spir-typesize.ll deleted file mode 100644 --- a/polly/test/GPGPU/spir-typesize.ll +++ /dev/null @@ -1,90 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-gpu-arch=spir64 \ -; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ -; RUN: FileCheck -check-prefix=I64 %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-gpu-arch=spir32 \ -; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ -; RUN: FileCheck -check-prefix=I32 %s - -; REQUIRES: pollyacc - -; This test case checks whether the openCl runtime functions (get_local_id/get_group_id) return the right types for 32 and 64bit devices. - -; I32: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -; I32-NEXT: target triple = "spir-unknown-unknown" - -; I32-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { -; I32-NEXT: entry: -; I32-NEXT: %0 = call i32 @__gen_ocl_get_group_id0() -; I32-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64 -; I32-NEXT: %1 = call i32 @__gen_ocl_get_group_id1() -; I32-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64 -; I32-NEXT: %2 = call i32 @__gen_ocl_get_local_id0() -; I32-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64 -; I32-NEXT: %3 = call i32 @__gen_ocl_get_local_id1() -; I32-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64 -; I32-NEXT: br label %polly.loop_preheader - -; I64: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -; I64-next: target triple = "spir64-unknown-unknown" - -; I64-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { -; I64-NEXT: entry: -; I64-NEXT: %0 = call i64 @__gen_ocl_get_group_id0() -; I64-NEXT: %1 = call i64 @__gen_ocl_get_group_id1() -; I64-NEXT: %2 = call i64 @__gen_ocl_get_local_id0() -; I64-NEXT: %3 = call i64 @__gen_ocl_get_local_id1() -; I64-NEXT: br label %polly.loop_preheader - - -; void double_parallel_loop(float A[][1024]) { -; for (long i = 0; i < 1024; i++) -; for (long j = 0; j < 1024; j++) -; A[i][j] += i * j; -; } -; - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @double_parallel_loop(ptr %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb13, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] - %exitcond1 = icmp ne i64 %i.0, 1024 - br i1 %exitcond1, label %bb3, label %bb15 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb10, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %j.0, 1024 - br i1 %exitcond, label %bb5, label %bb12 - -bb5: ; preds = %bb4 - %tmp = mul nuw nsw i64 %i.0, %j.0 - %tmp6 = sitofp i64 %tmp to float - %tmp7 = getelementptr inbounds [1024 x float], ptr %A, 
i64 %i.0, i64 %j.0 - %tmp8 = load float, ptr %tmp7, align 4 - %tmp9 = fadd float %tmp8, %tmp6 - store float %tmp9, ptr %tmp7, align 4 - br label %bb10 - -bb10: ; preds = %bb5 - %tmp11 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12 - %tmp14 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb15: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll b/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll deleted file mode 100644 --- a/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s - -; Check that we do not create a kernel if there is an -; unknown function call in a candidate kernel. - -; Check that we model the kernel as a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end13 - -; If a kernel were generated, then this code would have been part of the kernel -; and not the `.ll` file that is generated. -; CHECK: %conv = fpext float %0 to double -; CHECK-NEXT: %1 = tail call double @extern.fn(double %conv) -; CHECK-NEXT: %conv6 = fptrunc double %1 to float - -; REQUIRES: pollyacc - -; static const int N = 1000; -; void f(float A[N][N], int n, float B[N][N]) { -; for(int i = 0; i < n; i++) { -; for(int j = 0; j < n; j++) { -; B[i][j] = extern_fn(A[i][j], 3); -; } -; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.11.0" - -define void @f(ptr %A, i32 %n, ptr %B) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp3 = icmp sgt i32 %n, 0 - br i1 %cmp3, label %for.cond1.preheader.lr.ph, label %for.end13 - -for.cond1.preheader.lr.ph: ; preds = %entry.split - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc11 - %indvars.iv5 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next6, %for.inc11 ] - %cmp21 = icmp sgt i32 %n, 0 - br i1 %cmp21, label %for.body3.lr.ph, label %for.inc11 - -for.body3.lr.ph: ; preds = %for.cond1.preheader - br label %for.body3 - -for.body3: ; preds = %for.body3.lr.ph, %for.body3 - %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [1000 x float], ptr %A, i64 %indvars.iv5, i64 %indvars.iv - %0 = load float, ptr %arrayidx5, align 4 - %conv = fpext float %0 to double - %1 = tail call double @extern.fn(double %conv) - %conv6 = fptrunc double %1 to float - %arrayidx10 = getelementptr inbounds [1000 x float], ptr %B, i64 %indvars.iv5, i64 %indvars.iv - store float %conv6, ptr %arrayidx10, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %wide.trip.count = zext i32 %n to i64 - %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond, label %for.body3, label %for.cond1.for.inc11_crit_edge - -for.cond1.for.inc11_crit_edge: ; preds = %for.body3 - br label %for.inc11 - -for.inc11: ; preds = %for.cond1.for.inc11_crit_edge, %for.cond1.preheader - %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1 - %wide.trip.count7 = zext i32 %n to i64 - %exitcond8 = icmp ne i64 %indvars.iv.next6, %wide.trip.count7 - br i1 %exitcond8, label %for.cond1.preheader, label %for.cond.for.end13_crit_edge - -for.cond.for.end13_crit_edge: ; preds = %for.inc11 - br label %for.end13 - -for.end13: ; preds = %for.cond.for.end13_crit_edge, 
%entry.split
-  ret void
-}
-
-declare double @extern.fn(double) #0
-attributes #0 = { readnone }
diff --git a/polly/test/GPGPU/untouched-arrays.ll b/polly/test/GPGPU/untouched-arrays.ll
deleted file mode 100644
--- a/polly/test/GPGPU/untouched-arrays.ll
+++ /dev/null
@@ -1,270 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; REQUIRES: pollyacc
-
-; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_global_1, MemRef_global_1, (142) * sizeof(i32), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT:   dim3 k0_dimBlock(10);
-; CODE-NEXT:   dim3 k0_dimGrid(1);
-; CODE-NEXT:   kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_global_1);
-; CODE-NEXT:   cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_global_1, dev_MemRef_global_1, (142) * sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE: cudaCheckReturn(cudaFree(dev_MemRef_global_1));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb33(t0, 0);
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.hoge = type { [23 x i16], [22 x i16], [14 x i16], [13 x i16] }
-
-@global = external global [9 x %struct.hoge], align 16
-@global.1 = external global [9 x [152 x i32]], align 16
-
-; Function Attrs: nounwind uwtable
-define void @widget() #0 {
-bb:
-  br label %bb1
-
-bb1: ; preds = %bb1, %bb
-  br i1 undef, label %bb1, label %bb2
-
-bb2: ; preds = %bb2, %bb1
-  br i1 undef, label %bb2, label %bb3
-
-bb3: ; preds = %bb3, %bb2
-  br i1 undef, label %bb3, label %bb4
-
-bb4: ; preds = %bb4, %bb3
-  br i1 undef, label %bb4, label %bb5
-
-bb5: ; preds = %bb5, %bb4
-  br i1 undef, label %bb5, label %bb6
-
-bb6: ; preds = %bb6, %bb5
-  br i1 undef, label %bb6, label %bb7
-
-bb7: ; preds = %bb7, %bb6
-  br i1 undef, label %bb7, label %bb8
-
-bb8: ; preds = %bb8, %bb7
-  br i1 undef, label %bb8, label %bb9
-
-bb9: ; preds = %bb8
-  br label %bb10
-
-bb10: ; preds = %bb12, %bb9
-  br label %bb11
-
-bb11: ; preds = %bb11, %bb10
-  br i1 undef, label %bb11, label %bb12
-
-bb12: ; preds = %bb11
-  br i1 undef, label %bb10, label %bb13
-
-bb13: ; preds = %bb18, %bb12
-  br i1 undef, label %bb16, label %bb14
-
-bb14: ; preds = %bb16, %bb13
-  br i1 undef, label %bb15, label %bb18
-
-bb15: ; preds = %bb14
-  br label %bb17
-
-bb16: ; preds = %bb16, %bb13
-  br i1 undef, label %bb16, label %bb14
-
-bb17: ; preds = %bb17, %bb15
-  br i1 undef, label %bb17, label %bb18
-
-bb18: ; preds = %bb17, %bb14
-  br i1 undef, label %bb13, label %bb19
-
-bb19: ; preds = %bb25, %bb18
-  br label %bb20
-
-bb20: ; preds = %bb24, %bb19
-  br i1 undef, label %bb21, label %bb24
-
-bb21: ; preds = %bb20
-  br i1 undef, label %bb23, label %bb22
-
-bb22: ; preds = %bb21
-  br label %bb24
-
-bb23: ; preds = %bb21
-  br label %bb24
-
-bb24: ; preds = %bb23, %bb22, %bb20
-  br i1 undef, label %bb20, label %bb25
-
-bb25: ; preds = %bb24
-  br i1 undef, label %bb19, label %bb26
-
-bb26: ; preds = %bb56, %bb25
-  %tmp = phi ptr [ undef, %bb56 ], [ getelementptr inbounds ([9 x [152 x i32]], ptr @global.1, i64 0, i64 0, i64 32), %bb25 ]
-  br label %bb27
-
-bb27: ; preds = %bb27, %bb26
-  br i1 undef, label %bb27, label %bb28
-
-bb28: ; preds = %bb27
-  br label %bb30
-
-bb30: ; preds = %bb38, %bb28
-  %tmp31 = phi i32 [ 3, %bb28 ], [ %tmp40, %bb38 ]
-  %tmp32 = phi ptr [ %tmp, %bb28 ], [ %tmp39, %bb38 ]
-  br label %bb33
-
-bb33: ; preds = %bb33, %bb30
-  %tmp34 = phi i32 [ 0, %bb30 ], [ %tmp37, %bb33 ]
-  %tmp35 = phi ptr [ %tmp32, %bb30 ], [ undef, %bb33 ]
-
%tmp36 = getelementptr inbounds i32, ptr %tmp35, i64 1 - store i32 undef, ptr %tmp36, align 4, !tbaa !1 - %tmp37 = add nuw nsw i32 %tmp34, 1 - br i1 false, label %bb33, label %bb38 - -bb38: ; preds = %bb33 - %tmp39 = getelementptr i32, ptr %tmp32, i64 12 - %tmp40 = add nuw nsw i32 %tmp31, 1 - %tmp41 = icmp ne i32 %tmp40, 13 - br i1 %tmp41, label %bb30, label %bb42 - -bb42: ; preds = %bb38 - %tmp43 = getelementptr inbounds [9 x %struct.hoge], ptr @global, i64 0, i64 0, i32 3, i64 0 - br label %bb44 - -bb44: ; preds = %bb51, %bb42 - %tmp45 = phi i32 [ 0, %bb42 ], [ %tmp52, %bb51 ] - %tmp46 = phi ptr [ %tmp43, %bb42 ], [ undef, %bb51 ] - %tmp47 = load i16, ptr %tmp46, align 2, !tbaa !5 - br label %bb48 - -bb48: ; preds = %bb48, %bb44 - %tmp49 = phi i32 [ 0, %bb44 ], [ %tmp50, %bb48 ] - %tmp50 = add nuw nsw i32 %tmp49, 1 - br i1 false, label %bb48, label %bb51 - -bb51: ; preds = %bb48 - %tmp52 = add nuw nsw i32 %tmp45, 1 - %tmp53 = icmp ne i32 %tmp52, 13 - br i1 %tmp53, label %bb44, label %bb54 - -bb54: ; preds = %bb51 - br label %bb55 - -bb55: ; preds = %bb55, %bb54 - br i1 undef, label %bb55, label %bb56 - -bb56: ; preds = %bb55 - br i1 undef, label %bb26, label %bb57 - -bb57: ; preds = %bb60, %bb56 - br label %bb58 - -bb58: ; preds = %bb58, %bb57 - br i1 undef, label %bb58, label %bb59 - -bb59: ; preds = %bb59, %bb58 - br i1 undef, label %bb59, label %bb60 - -bb60: ; preds = %bb59 - br i1 undef, label %bb57, label %bb61 - -bb61: ; preds = %bb65, %bb60 - br label %bb62 - -bb62: ; preds = %bb64, %bb61 - br label %bb63 - -bb63: ; preds = %bb63, %bb62 - br i1 undef, label %bb63, label %bb64 - -bb64: ; preds = %bb63 - br i1 undef, label %bb62, label %bb65 - -bb65: ; preds = %bb64 - br i1 undef, label %bb61, label %bb66 - -bb66: ; preds = %bb70, %bb65 - br label %bb67 - -bb67: ; preds = %bb69, %bb66 - br label %bb68 - -bb68: ; preds = %bb68, %bb67 - br i1 undef, label %bb68, label %bb69 - -bb69: ; preds = %bb68 - br i1 undef, label %bb67, label %bb70 - -bb70: ; preds = %bb69 - br i1 undef, label %bb66, label %bb71 - -bb71: ; preds = %bb73, %bb70 - br label %bb72 - -bb72: ; preds = %bb72, %bb71 - br i1 undef, label %bb72, label %bb73 - -bb73: ; preds = %bb72 - br i1 undef, label %bb71, label %bb74 - -bb74: ; preds = %bb80, %bb73 - br label %bb75 - -bb75: ; preds = %bb79, %bb74 - br label %bb76 - -bb76: ; preds = %bb78, %bb75 - br label %bb77 - -bb77: ; preds = %bb77, %bb76 - br i1 undef, label %bb77, label %bb78 - -bb78: ; preds = %bb77 - br i1 undef, label %bb76, label %bb79 - -bb79: ; preds = %bb78 - br i1 undef, label %bb75, label %bb80 - -bb80: ; preds = %bb79 - br i1 undef, label %bb74, label %bb81 - -bb81: ; preds = %bb85, %bb80 - br label %bb82 - -bb82: ; preds = %bb84, %bb81 - br label %bb83 - -bb83: ; preds = %bb83, %bb82 - br i1 undef, label %bb83, label %bb84 - -bb84: ; preds = %bb83 - br i1 undef, label %bb82, label %bb85 - -bb85: ; preds = %bb84 - br i1 undef, label %bb81, label %bb86 - -bb86: ; preds = %bb85 - ret void -} - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} - -!0 = !{!"clang version 4.0.0"} -!1 = !{!2, !2, i64 0} -!2 = !{!"int", !3, i64 0} -!3 = 
!{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} -!5 = !{!6, !6, i64 0} -!6 = !{!"short", !3, i64 0} diff --git a/polly/test/Unit/lit.site.cfg.in b/polly/test/Unit/lit.site.cfg.in --- a/polly/test/Unit/lit.site.cfg.in +++ b/polly/test/Unit/lit.site.cfg.in @@ -11,7 +11,6 @@ config.polly_lib_dir = "@POLLY_LIB_DIR@" config.shlibdir = "@SHLIBDIR@" config.target_triple = "@LLVM_TARGET_TRIPLE@" -config.enable_gpgpu_codegen = "@GPU_CODEGEN@" config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@" config.has_unittests = @POLLY_GTEST_AVAIL@ diff --git a/polly/test/lit.cfg b/polly/test/lit.cfg --- a/polly/test/lit.cfg +++ b/polly/test/lit.cfg @@ -70,6 +70,4 @@ print("Could not find llvm-config in " + config.llvm_tools_dir) exit(42) -if re.search(r'NVPTX', llvm_config_cmd.stdout.read().decode('ascii')): - config.available_features.add('nvptx-registered-target') llvm_config_cmd.wait() diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in --- a/polly/test/lit.site.cfg.in +++ b/polly/test/lit.site.cfg.in @@ -7,7 +7,6 @@ config.polly_obj_root = "@POLLY_BINARY_DIR@" config.polly_lib_dir = "@POLLY_LIB_DIR@" config.target_triple = "@LLVM_TARGET_TRIPLE@" -config.enable_gpgpu_codegen = "@GPU_CODEGEN@" config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@" config.targets_to_build = "@TARGETS_TO_BUILD@" config.extra_paths = "@POLLY_TEST_EXTRA_PATHS@".split(";") @@ -50,9 +49,6 @@ config.substitutions.append(('%loadNPMPolly', commonOpts )) -if config.enable_gpgpu_codegen == 'TRUE' : - config.available_features.add('pollyacc') - import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/polly/tools/CMakeLists.txt b/polly/tools/CMakeLists.txt deleted file mode 100644 --- a/polly/tools/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -if (CUDA_FOUND OR OpenCL_FOUND) - add_subdirectory(GPURuntime) -endif (CUDA_FOUND OR OpenCL_FOUND) - -set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) diff --git a/polly/tools/GPURuntime/CMakeLists.txt b/polly/tools/GPURuntime/CMakeLists.txt deleted file mode 100644 --- a/polly/tools/GPURuntime/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -set(MODULE TRUE) -set(LLVM_NO_RTTI 1) - -add_polly_library(GPURuntime - GPUJIT.c - ) - -set_target_properties(GPURuntime - PROPERTIES - LINKER_LANGUAGE C - PREFIX "lib" - ) - -set_property(TARGET GPURuntime PROPERTY C_STANDARD 99) - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=default ") -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-sanitize=all ") -endif() diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h deleted file mode 100644 --- a/polly/tools/GPURuntime/GPUJIT.h +++ /dev/null @@ -1,123 +0,0 @@ -/******************************************************************************/ -/* */ -/* Part of the LLVM Project, under the Apache License v2.0 with LLVM */ -/* Exceptions. */ -/* See https://llvm.org/LICENSE.txt for license information. */ -/* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ -/* */ -/******************************************************************************/ -/* */ -/* This file defines GPUJIT. */ -/* */ -/******************************************************************************/ - -#ifndef GPUJIT_H_ -#define GPUJIT_H_ -#include "stddef.h" - -/* - * The following demonstrates how we can use the GPURuntime library to - * execute a GPU kernel. 
- * - * char KernelString[] = "\n\ - * .version 1.4\n\ - * .target sm_10, map_f64_to_f32\n\ - * .entry _Z8myKernelPi (\n\ - * .param .u64 __cudaparm__Z8myKernelPi_data)\n\ - * {\n\ - * .reg .u16 %rh<4>;\n\ - * .reg .u32 %r<5>;\n\ - * .reg .u64 %rd<6>;\n\ - * cvt.u32.u16 %r1, %tid.x;\n\ - * mov.u16 %rh1, %ctaid.x;\n\ - * mov.u16 %rh2, %ntid.x;\n\ - * mul.wide.u16 %r2, %rh1, %rh2;\n\ - * add.u32 %r3, %r1, %r2;\n\ - * ld.param.u64 %rd1, [__cudaparm__Z8myKernelPi_data];\n\ - * cvt.s64.s32 %rd2, %r3;\n\ - * mul.wide.s32 %rd3, %r3, 4;\n\ - * add.u64 %rd4, %rd1, %rd3;\n\ - * st.global.s32 [%rd4+0], %r3;\n\ - * exit;\n\ - * }\n\ - * "; - * - * const char *Entry = "_Z8myKernelPi"; - * - * int main() { - * PollyGPUFunction *Kernel; - * PollyGPUContext *Context; - * PollyGPUDevicePtr *DevArray; - * int *HostData; - * int MemSize; - * - * int GridX = 8; - * int GridY = 8; - * - * int BlockX = 16; - * int BlockY = 16; - * int BlockZ = 1; - * - * MemSize = 256*64*sizeof(int); - * Context = polly_initContext(); - * DevArray = polly_allocateMemoryForDevice(MemSize); - * Kernel = polly_getKernel(KernelString, KernelName); - * - * void *Params[1]; - * void *DevPtr = polly_getDevicePtr(DevArray) - * Params[0] = &DevPtr; - * - * polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params); - * - * polly_copyFromDeviceToHost(HostData, DevData, MemSize); - * polly_freeKernel(Kernel); - * polly_freeDeviceMemory(DevArray); - * polly_freeContext(Context); - * } - * - */ - -typedef enum PollyGPURuntimeT { - RUNTIME_NONE, - RUNTIME_CUDA, - RUNTIME_CL -} PollyGPURuntime; - -typedef struct PollyGPUContextT PollyGPUContext; -typedef struct PollyGPUFunctionT PollyGPUFunction; -typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; - -typedef struct OpenCLContextT OpenCLContext; -typedef struct OpenCLKernelT OpenCLKernel; -typedef struct OpenCLDevicePtrT OpenCLDevicePtr; - -typedef struct CUDAContextT CUDAContext; -typedef struct CUDAKernelT CUDAKernel; -typedef struct CUDADevicePtrT CUDADevicePtr; - -PollyGPUContext *polly_initContextCUDA(); -PollyGPUContext *polly_initContextCL(); -PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, - const char *KernelName); -void polly_freeKernel(PollyGPUFunction *Kernel); -void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize); -void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize); -void polly_synchronizeDevice(); -void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockSizeX, - unsigned int BlockSizeY, unsigned int BlockSizeZ, - void **Parameters); -void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation); -void polly_freeContext(PollyGPUContext *Context); - -// Note that polly_{malloc/free}Managed are currently not used by Polly. -// We use them in COSMO by replacing all malloc with polly_mallocManaged and all -// frees with cudaFree, so we can get managed memory "automatically". -// Needless to say, this is a hack. -// Please make sure that this code is not present in Polly when 2018 rolls in. 
-// If this is still present, ping Siddharth Bhat
-void *polly_mallocManaged(size_t size);
-void polly_freeManaged(void *mem);
-#endif /* GPUJIT_H_ */
diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c
deleted file mode 100644
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ /dev/null
@@ -1,1856 +0,0 @@
-/******************** GPUJIT.c - GPUJIT Execution Engine **********************/
-/*                                                                            */
-/* Part of the LLVM Project, under the Apache License v2.0 with LLVM          */
-/* Exceptions.                                                                */
-/* See https://llvm.org/LICENSE.txt for license information.                  */
-/* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception                    */
-/*                                                                            */
-/******************************************************************************/
-/*                                                                            */
-/* This file implements GPUJIT, a ptx string execution engine for GPU.        */
-/*                                                                            */
-/******************************************************************************/
-
-#include "GPUJIT.h"
-
-#ifdef HAS_LIBCUDART
-#include <cuda.h>
-#include <cuda_runtime.h>
-#endif /* HAS_LIBCUDART */
-
-#ifdef HAS_LIBOPENCL
-#ifdef __APPLE__
-#include <OpenCL/opencl.h>
-#else
-#include <CL/cl.h>
-#endif /* __APPLE__ */
-#endif /* HAS_LIBOPENCL */
-
-#include <assert.h>
-#include <dlfcn.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-static int DebugMode;
-static int CacheMode;
-#define max(x, y) ((x) > (y) ? (x) : (y))
-
-static PollyGPURuntime Runtime = RUNTIME_NONE;
-
-static void debug_print(const char *format, ...) {
-  if (!DebugMode)
-    return;
-
-  va_list args;
-  va_start(args, format);
-  vfprintf(stderr, format, args);
-  va_end(args);
-}
-#define dump_function() debug_print("-> %s\n", __func__)
-
-#define KERNEL_CACHE_SIZE 10
-
-static void err_runtime() __attribute__((noreturn));
-static void err_runtime() {
-  fprintf(stderr, "Runtime not correctly initialized.\n");
-  exit(-1);
-}
-
-struct PollyGPUContextT {
-  void *Context;
-};
-
-struct PollyGPUFunctionT {
-  void *Kernel;
-};
-
-struct PollyGPUDevicePtrT {
-  void *DevicePtr;
-};
-
-/******************************************************************************/
-/* OpenCL                                                                     */
-/******************************************************************************/
-#ifdef HAS_LIBOPENCL
-
-struct OpenCLContextT {
-  cl_context Context;
-  cl_command_queue CommandQueue;
-};
-
-struct OpenCLKernelT {
-  cl_kernel Kernel;
-  cl_program Program;
-  const char *BinaryString;
-};
-
-struct OpenCLDevicePtrT {
-  cl_mem MemObj;
-};
-
-/* Dynamic library handles for the OpenCL runtime library. */
-static void *HandleOpenCL;
-static void *HandleOpenCLBeignet;
-
-/* Type-defines of function pointer to OpenCL Runtime API.
*/ -typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, - cl_platform_id *Platforms, - cl_uint *NumPlatforms); -static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; - -typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform, - cl_device_type DeviceType, - cl_uint NumEntries, cl_device_id *Devices, - cl_uint *NumDevices); -static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; - -typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device, - cl_device_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; - -typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; - -typedef cl_context clCreateContextFcnTy( - const cl_context_properties *Properties, cl_uint NumDevices, - const cl_device_id *Devices, - void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo, - size_t CB, void *UserData), - void *UserData, cl_int *ErrcodeRet); -static clCreateContextFcnTy *clCreateContextFcnPtr; - -typedef cl_command_queue -clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device, - cl_command_queue_properties Properties, - cl_int *ErrcodeRet); -static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; - -typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags, - size_t Size, void *HostPtr, - cl_int *ErrcodeRet); -static clCreateBufferFcnTy *clCreateBufferFcnPtr; - -typedef cl_int -clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer, - cl_bool BlockingWrite, size_t Offset, size_t Size, - const void *Ptr, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; - -typedef cl_program -clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices, - const cl_device_id *DeviceList, - const char *Filename, cl_int *ErrcodeRet); -static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr; - -typedef cl_program clCreateProgramWithBinaryFcnTy( - cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, - const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, - cl_int *ErrcodeRet); -static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; - -typedef cl_int clBuildProgramFcnTy( - cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList, - const char *Options, - void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData), - void *UserData); -static clBuildProgramFcnTy *clBuildProgramFcnPtr; - -typedef cl_kernel clCreateKernelFcnTy(cl_program Program, - const char *KernelName, - cl_int *ErrcodeRet); -static clCreateKernelFcnTy *clCreateKernelFcnPtr; - -typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex, - size_t ArgSize, const void *ArgValue); -static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; - -typedef cl_int clEnqueueNDRangeKernelFcnTy( - cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim, - const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; - -typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue, - cl_mem Buffer, cl_bool BlockingRead, - size_t Offset, size_t Size, void *Ptr, - cl_uint 
NumEventsInWaitList,
-                                        const cl_event *EventWaitList,
-                                        cl_event *Event);
-static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
-
-typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
-static clFlushFcnTy *clFlushFcnPtr;
-
-typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
-static clFinishFcnTy *clFinishFcnPtr;
-
-typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
-static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
-
-typedef cl_int clReleaseProgramFcnTy(cl_program Program);
-static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
-
-typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
-static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
-
-typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
-static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
-
-typedef cl_int clReleaseContextFcnTy(cl_context Context);
-static clReleaseContextFcnTy *clReleaseContextFcnPtr;
-
-static void *getAPIHandleCL(void *Handle, const char *FuncName) {
-  char *Err;
-  void *FuncPtr;
-  dlerror();
-  FuncPtr = dlsym(Handle, FuncName);
-  if ((Err = dlerror()) != 0) {
-    fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
-    return 0;
-  }
-  return FuncPtr;
-}
-
-static int initialDeviceAPILibrariesCL() {
-  HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
-  HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
-  if (!HandleOpenCL) {
-    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
-    return 0;
-  }
-  return 1;
-}
-
-/* Get function pointer to OpenCL Runtime API.
- *
- * Note that compilers conforming to the ISO C standard are required to
- * generate a warning if a conversion from a void * pointer to a function
- * pointer is attempted as in the following statements. The warning
- * of this kind of cast may not be emitted by clang and new versions of gcc
- * as it is valid on POSIX 2008. For compilers required to generate a warning,
- * we temporarily disable -Wpedantic, to avoid bloating the output with
- * unnecessary warnings.
- *
- * Reference:
- * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
- */
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wpedantic"
-static int initialDeviceAPIsCL() {
-  if (initialDeviceAPILibrariesCL() == 0)
-    return 0;
-
-  // FIXME: We are now always selecting the Intel Beignet driver if it is
-  // available on the system, instead of a possible NVIDIA or AMD OpenCL
-  // API. This selection should occur based on the target architecture
-  // chosen when compiling.
-  void *Handle =
-      (HandleOpenCLBeignet != NULL ?
HandleOpenCLBeignet : HandleOpenCL); - - clGetPlatformIDsFcnPtr = - (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs"); - - clGetDeviceIDsFcnPtr = - (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs"); - - clGetDeviceInfoFcnPtr = - (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo"); - - clGetKernelInfoFcnPtr = - (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo"); - - clCreateContextFcnPtr = - (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext"); - - clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( - Handle, "clCreateCommandQueue"); - - clCreateBufferFcnPtr = - (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer"); - - clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( - Handle, "clEnqueueWriteBuffer"); - - if (HandleOpenCLBeignet) - clCreateProgramWithLLVMIntelFcnPtr = - (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL( - Handle, "clCreateProgramWithLLVMIntel"); - - clCreateProgramWithBinaryFcnPtr = - (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( - Handle, "clCreateProgramWithBinary"); - - clBuildProgramFcnPtr = - (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram"); - - clCreateKernelFcnPtr = - (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel"); - - clSetKernelArgFcnPtr = - (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg"); - - clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( - Handle, "clEnqueueNDRangeKernel"); - - clEnqueueReadBufferFcnPtr = - (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer"); - - clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush"); - - clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish"); - - clReleaseKernelFcnPtr = - (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel"); - - clReleaseProgramFcnPtr = - (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram"); - - clReleaseMemObjectFcnPtr = - (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject"); - - clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( - Handle, "clReleaseCommandQueue"); - - clReleaseContextFcnPtr = - (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext"); - - return 1; -} -#pragma GCC diagnostic pop - -/* Context and Device. */ -static PollyGPUContext *GlobalContext = NULL; -static cl_device_id GlobalDeviceID = NULL; - -/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ -static void printOpenCLError(int Error); - -static void checkOpenCLError(int Ret, const char *format, ...) { - if (Ret == CL_SUCCESS) - return; - - printOpenCLError(Ret); - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - exit(-1); -} - -static PollyGPUContext *initContextCL() { - dump_function(); - - PollyGPUContext *Context; - - cl_platform_id PlatformID = NULL; - cl_device_id DeviceID = NULL; - cl_uint NumDevicesRet; - cl_int Ret; - - char DeviceRevision[256]; - char DeviceName[256]; - size_t DeviceRevisionRetSize, DeviceNameRetSize; - - static __thread PollyGPUContext *CurrentContext = NULL; - - if (CurrentContext) - return CurrentContext; - - /* Get API handles. */ - if (initialDeviceAPIsCL() == 0) { - fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); - exit(-1); - } - - /* Get number of devices that support OpenCL. 
*/
-  static const int NumberOfPlatforms = 1;
-  Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
-  checkOpenCLError(Ret, "Failed to get platform IDs.\n");
-  // TODO: Extend to CL_DEVICE_TYPE_ALL?
-  static const int NumberOfDevices = 1;
-  Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
-                             &DeviceID, &NumDevicesRet);
-  checkOpenCLError(Ret, "Failed to get device IDs.\n");
-
-  GlobalDeviceID = DeviceID;
-  if (NumDevicesRet == 0) {
-    fprintf(stderr, "There is no device supporting OpenCL.\n");
-    exit(-1);
-  }
-
-  /* Get device revision. */
-  Ret =
-      clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
-                            DeviceRevision, &DeviceRevisionRetSize);
-  checkOpenCLError(Ret, "Failed to fetch device revision.\n");
-
-  /* Get device name. */
-  Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
-                              DeviceName, &DeviceNameRetSize);
-  checkOpenCLError(Ret, "Failed to fetch device name.\n");
-
-  debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
-
-  /* Create context on the device. */
-  Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
-  if (Context == 0) {
-    fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
-    exit(-1);
-  }
-  Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
-  if (Context->Context == 0) {
-    fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
-    exit(-1);
-  }
-  ((OpenCLContext *)Context->Context)->Context =
-      clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
-  checkOpenCLError(Ret, "Failed to create context.\n");
-
-  static const int ExtraProperties = 0;
-  ((OpenCLContext *)Context->Context)->CommandQueue =
-      clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
-                                 DeviceID, ExtraProperties, &Ret);
-  checkOpenCLError(Ret, "Failed to create command queue.\n");
-
-  if (CacheMode)
-    CurrentContext = Context;
-
-  GlobalContext = Context;
-  return Context;
-}
-
-static void freeKernelCL(PollyGPUFunction *Kernel) {
-  dump_function();
-
-  if (CacheMode)
-    return;
-
-  if (!GlobalContext) {
-    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
-    exit(-1);
-  }
-
-  cl_int Ret;
-  Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
-  checkOpenCLError(Ret, "Failed to flush command queue.\n");
-  Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
-  checkOpenCLError(Ret, "Failed to finish command queue.\n");
-
-  if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
-    cl_int Ret =
-        clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
-    checkOpenCLError(Ret, "Failed to release kernel.\n");
-  }
-
-  if (((OpenCLKernel *)Kernel->Kernel)->Program) {
-    cl_int Ret =
-        clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
-    checkOpenCLError(Ret, "Failed to release program.\n");
-  }
-
-  if (Kernel->Kernel)
-    free((OpenCLKernel *)Kernel->Kernel);
-
-  if (Kernel)
-    free(Kernel);
-}
-
-static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
-                                     const char *KernelName) {
-  dump_function();
-
-  if (!GlobalContext) {
-    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
-    exit(-1);
-  }
-
-  static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
-  static __thread int NextCacheItem = 0;
-
-  for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
-    // We exploit here the property that all Polly-ACC kernels are allocated
-    // as global constants, hence a pointer comparison is sufficient to
-    // determine equality.
-    if (KernelCache[i] &&
-        ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
-            BinaryBuffer) {
-      debug_print("  -> using cached kernel\n");
-      return KernelCache[i];
-    }
-  }
-
-  PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
-  if (Function == 0) {
-    fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
-    exit(-1);
-  }
-  Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
-  if (Function->Kernel == 0) {
-    fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
-    exit(-1);
-  }
-
-  if (!GlobalDeviceID) {
-    fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
-    exit(-1);
-  }
-
-  cl_int Ret;
-
-  if (HandleOpenCLBeignet) {
-    // This is a workaround, since clCreateProgramWithLLVMIntel only
-    // accepts a filename to a valid llvm-ir file as an argument, instead
-    // of accepting the BinaryBuffer directly.
-    char FileName[] = "/tmp/polly_kernelXXXXXX";
-    int File = mkstemp(FileName);
-    write(File, BinaryBuffer, strlen(BinaryBuffer));
-
-    ((OpenCLKernel *)Function->Kernel)->Program =
-        clCreateProgramWithLLVMIntelFcnPtr(
-            ((OpenCLContext *)GlobalContext->Context)->Context, 1,
-            &GlobalDeviceID, FileName, &Ret);
-    checkOpenCLError(Ret, "Failed to create program from llvm.\n");
-    close(File);
-    unlink(FileName);
-  } else {
-    size_t BinarySize = strlen(BinaryBuffer);
-    ((OpenCLKernel *)Function->Kernel)->Program =
-        clCreateProgramWithBinaryFcnPtr(
-            ((OpenCLContext *)GlobalContext->Context)->Context, 1,
-            &GlobalDeviceID, (const size_t *)&BinarySize,
-            (const unsigned char **)&BinaryBuffer, NULL, &Ret);
-    checkOpenCLError(Ret, "Failed to create program from binary.\n");
-  }
-
-  Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
-                             &GlobalDeviceID, NULL, NULL, NULL);
-  checkOpenCLError(Ret, "Failed to build program.\n");
-
-  ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
-      ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
-  checkOpenCLError(Ret, "Failed to create kernel.\n");
-
-  ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
-
-  if (CacheMode) {
-    if (KernelCache[NextCacheItem])
-      freeKernelCL(KernelCache[NextCacheItem]);
-
-    KernelCache[NextCacheItem] = Function;
-
-    NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
-  }
-
-  return Function;
-}
-
-static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
-                                   long MemSize) {
-  dump_function();
-
-  if (!GlobalContext) {
-    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
-    exit(-1);
-  }
-
-  cl_int Ret;
-  Ret = clEnqueueWriteBufferFcnPtr(
-      ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
-      ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
-      HostData, 0, NULL, NULL);
-  checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
-}
-
-static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
-                                   long MemSize) {
-  dump_function();
-
-  if (!GlobalContext) {
-    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
-    exit(-1);
-  }
-
-  cl_int Ret;
-  Ret = clEnqueueReadBufferFcnPtr(
-      ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
-      ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
-      HostData, 0, NULL, NULL);
-  checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
-}
-
-static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
-                           unsigned int GridDimY, unsigned int BlockDimX,
-                           unsigned int
BlockDimY, unsigned int BlockDimZ, - void **Parameters) { - dump_function(); - - cl_int Ret; - cl_uint NumArgs; - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel; - Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS, - sizeof(cl_uint), &NumArgs, NULL); - checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n"); - - /* Argument sizes are stored at the end of the Parameters array. */ - for (cl_uint i = 0; i < NumArgs; i++) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, - *((int *)Parameters[NumArgs + i]), - (void *)Parameters[i]); - checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); - } - - unsigned int GridDimZ = 1; - size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY, - BlockDimZ * GridDimZ}; - size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ}; - - static const int WorkDim = 3; - OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; - Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel, - WorkDim, NULL, GlobalWorkSize, - LocalWorkSize, 0, NULL, NULL); - checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n"); -} - -static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) { - dump_function(); - - OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; - cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj); - checkOpenCLError(Ret, "Failed to free device memory.\n"); - - free(DevPtr); - free(Allocation); -} - -static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); - if (DevData == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr)); - if (DevData->DevicePtr == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - - cl_int Ret; - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj = - clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context, - CL_MEM_READ_WRITE, MemSize, NULL, &Ret); - checkOpenCLError(Ret, - "Allocate memory for GPU device memory pointer failed.\n"); - - return DevData; -} - -static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) { - dump_function(); - - OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; - return (void *)DevPtr->MemObj; -} - -static void synchronizeDeviceCL() { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) != - CL_SUCCESS) { - fprintf(stderr, "Synchronizing device and host memory failed.\n"); - exit(-1); - } -} - -static void freeContextCL(PollyGPUContext *Context) { - dump_function(); - - cl_int Ret; - - GlobalContext = NULL; - - OpenCLContext *Ctx = (OpenCLContext *)Context->Context; - if (Ctx->CommandQueue) { - Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue); - checkOpenCLError(Ret, "Could not release command queue.\n"); - } - - if (Ctx->Context) { - Ret = clReleaseContextFcnPtr(Ctx->Context); - checkOpenCLError(Ret, "Could not release context.\n"); - } - - free(Ctx); - free(Context); -} - -static void 
printOpenCLError(int Error) { - - switch (Error) { - case CL_SUCCESS: - // Success, don't print an error. - break; - - // JIT/Runtime errors. - case CL_DEVICE_NOT_FOUND: - fprintf(stderr, "Device not found.\n"); - break; - case CL_DEVICE_NOT_AVAILABLE: - fprintf(stderr, "Device not available.\n"); - break; - case CL_COMPILER_NOT_AVAILABLE: - fprintf(stderr, "Compiler not available.\n"); - break; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - fprintf(stderr, "Mem object allocation failure.\n"); - break; - case CL_OUT_OF_RESOURCES: - fprintf(stderr, "Out of resources.\n"); - break; - case CL_OUT_OF_HOST_MEMORY: - fprintf(stderr, "Out of host memory.\n"); - break; - case CL_PROFILING_INFO_NOT_AVAILABLE: - fprintf(stderr, "Profiling info not available.\n"); - break; - case CL_MEM_COPY_OVERLAP: - fprintf(stderr, "Mem copy overlap.\n"); - break; - case CL_IMAGE_FORMAT_MISMATCH: - fprintf(stderr, "Image format mismatch.\n"); - break; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - fprintf(stderr, "Image format not supported.\n"); - break; - case CL_BUILD_PROGRAM_FAILURE: - fprintf(stderr, "Build program failure.\n"); - break; - case CL_MAP_FAILURE: - fprintf(stderr, "Map failure.\n"); - break; - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - fprintf(stderr, "Misaligned sub buffer offset.\n"); - break; - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - fprintf(stderr, "Exec status error for events in wait list.\n"); - break; - case CL_COMPILE_PROGRAM_FAILURE: - fprintf(stderr, "Compile program failure.\n"); - break; - case CL_LINKER_NOT_AVAILABLE: - fprintf(stderr, "Linker not available.\n"); - break; - case CL_LINK_PROGRAM_FAILURE: - fprintf(stderr, "Link program failure.\n"); - break; - case CL_DEVICE_PARTITION_FAILED: - fprintf(stderr, "Device partition failed.\n"); - break; - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - fprintf(stderr, "Kernel arg info not available.\n"); - break; - - // Compiler errors. 
- case CL_INVALID_VALUE: - fprintf(stderr, "Invalid value.\n"); - break; - case CL_INVALID_DEVICE_TYPE: - fprintf(stderr, "Invalid device type.\n"); - break; - case CL_INVALID_PLATFORM: - fprintf(stderr, "Invalid platform.\n"); - break; - case CL_INVALID_DEVICE: - fprintf(stderr, "Invalid device.\n"); - break; - case CL_INVALID_CONTEXT: - fprintf(stderr, "Invalid context.\n"); - break; - case CL_INVALID_QUEUE_PROPERTIES: - fprintf(stderr, "Invalid queue properties.\n"); - break; - case CL_INVALID_COMMAND_QUEUE: - fprintf(stderr, "Invalid command queue.\n"); - break; - case CL_INVALID_HOST_PTR: - fprintf(stderr, "Invalid host pointer.\n"); - break; - case CL_INVALID_MEM_OBJECT: - fprintf(stderr, "Invalid memory object.\n"); - break; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - fprintf(stderr, "Invalid image format descriptor.\n"); - break; - case CL_INVALID_IMAGE_SIZE: - fprintf(stderr, "Invalid image size.\n"); - break; - case CL_INVALID_SAMPLER: - fprintf(stderr, "Invalid sampler.\n"); - break; - case CL_INVALID_BINARY: - fprintf(stderr, "Invalid binary.\n"); - break; - case CL_INVALID_BUILD_OPTIONS: - fprintf(stderr, "Invalid build options.\n"); - break; - case CL_INVALID_PROGRAM: - fprintf(stderr, "Invalid program.\n"); - break; - case CL_INVALID_PROGRAM_EXECUTABLE: - fprintf(stderr, "Invalid program executable.\n"); - break; - case CL_INVALID_KERNEL_NAME: - fprintf(stderr, "Invalid kernel name.\n"); - break; - case CL_INVALID_KERNEL_DEFINITION: - fprintf(stderr, "Invalid kernel definition.\n"); - break; - case CL_INVALID_KERNEL: - fprintf(stderr, "Invalid kernel.\n"); - break; - case CL_INVALID_ARG_INDEX: - fprintf(stderr, "Invalid arg index.\n"); - break; - case CL_INVALID_ARG_VALUE: - fprintf(stderr, "Invalid arg value.\n"); - break; - case CL_INVALID_ARG_SIZE: - fprintf(stderr, "Invalid arg size.\n"); - break; - case CL_INVALID_KERNEL_ARGS: - fprintf(stderr, "Invalid kernel args.\n"); - break; - case CL_INVALID_WORK_DIMENSION: - fprintf(stderr, "Invalid work dimension.\n"); - break; - case CL_INVALID_WORK_GROUP_SIZE: - fprintf(stderr, "Invalid work group size.\n"); - break; - case CL_INVALID_WORK_ITEM_SIZE: - fprintf(stderr, "Invalid work item size.\n"); - break; - case CL_INVALID_GLOBAL_OFFSET: - fprintf(stderr, "Invalid global offset.\n"); - break; - case CL_INVALID_EVENT_WAIT_LIST: - fprintf(stderr, "Invalid event wait list.\n"); - break; - case CL_INVALID_EVENT: - fprintf(stderr, "Invalid event.\n"); - break; - case CL_INVALID_OPERATION: - fprintf(stderr, "Invalid operation.\n"); - break; - case CL_INVALID_GL_OBJECT: - fprintf(stderr, "Invalid GL object.\n"); - break; - case CL_INVALID_BUFFER_SIZE: - fprintf(stderr, "Invalid buffer size.\n"); - break; - case CL_INVALID_MIP_LEVEL: - fprintf(stderr, "Invalid mip level.\n"); - break; - case CL_INVALID_GLOBAL_WORK_SIZE: - fprintf(stderr, "Invalid global work size.\n"); - break; - case CL_INVALID_PROPERTY: - fprintf(stderr, "Invalid property.\n"); - break; - case CL_INVALID_IMAGE_DESCRIPTOR: - fprintf(stderr, "Invalid image descriptor.\n"); - break; - case CL_INVALID_COMPILER_OPTIONS: - fprintf(stderr, "Invalid compiler options.\n"); - break; - case CL_INVALID_LINKER_OPTIONS: - fprintf(stderr, "Invalid linker options.\n"); - break; - case CL_INVALID_DEVICE_PARTITION_COUNT: - fprintf(stderr, "Invalid device partition count.\n"); - break; - case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE - fprintf(stderr, "Invalid pipe size.\n"); - break; - case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE - fprintf(stderr, "Invalid device 
queue.\n"); - break; - - // NVIDIA specific error. - case -9999: - fprintf(stderr, "NVIDIA invalid read or write buffer.\n"); - break; - - default: - fprintf(stderr, "Unknown error code!\n"); - break; - } -} - -#endif /* HAS_LIBOPENCL */ -/******************************************************************************/ -/* CUDA */ -/******************************************************************************/ -#ifdef HAS_LIBCUDART - -struct CUDAContextT { - CUcontext Cuda; -}; - -struct CUDAKernelT { - CUfunction Cuda; - CUmodule CudaModule; - const char *BinaryString; -}; - -struct CUDADevicePtrT { - CUdeviceptr Cuda; -}; - -/* Dynamic library handles for the CUDA and CUDA runtime library. */ -static void *HandleCuda; -static void *HandleCudaRT; - -/* Type-defines of function pointer to CUDA driver APIs. */ -typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); -static CuMemAllocFcnTy *CuMemAllocFcnPtr; - -typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t, - unsigned int); -static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr; - -typedef CUresult CUDAAPI CuLaunchKernelFcnTy( - CUfunction F, unsigned int GridDimX, unsigned int GridDimY, - unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY, - unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream, - void **KernelParams, void **Extra); -static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; - -typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); -static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr; - -typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t); -static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr; - -typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr); -static CuMemFreeFcnTy *CuMemFreeFcnPtr; - -typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule); -static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr; - -typedef CUresult CUDAAPI CuProfilerStopFcnTy(); -static CuProfilerStopFcnTy *CuProfilerStopFcnPtr; - -typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext); -static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr; - -typedef CUresult CUDAAPI CuInitFcnTy(unsigned int); -static CuInitFcnTy *CuInitFcnPtr; - -typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *); -static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr; - -typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice); -static CuCtxCreateFcnTy *CuCtxCreateFcnPtr; - -typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *); -static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr; - -typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int); -static CuDeviceGetFcnTy *CuDeviceGetFcnPtr; - -typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *, - unsigned int, CUjit_option *, - void **); -static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; - -typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module, - const void *Image); -static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr; - -typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, - const char *); -static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr; - -typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice); -static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr; - -typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); -static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; - -typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State, - CUjitInputType Type, void *Data, - size_t Size, const char *Name, - unsigned 
int NumOptions,
-                                            CUjit_option *Options,
-                                            void **OptionValues);
-static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
-
-typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
-                                           CUjit_option *Options,
-                                           void **OptionValues,
-                                           CUlinkState *StateOut);
-static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
-
-typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
-                                             size_t *SizeOut);
-static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
-
-typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
-static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
-
-typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
-static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
-
-/* Type-defines of function pointer to CUDA runtime APIs. */
-typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
-static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
-
-static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
-  char *Err;
-  void *FuncPtr;
-  dlerror();
-  FuncPtr = dlsym(Handle, FuncName);
-  if ((Err = dlerror()) != 0) {
-    fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
-    return 0;
-  }
-  return FuncPtr;
-}
-
-static int initialDeviceAPILibrariesCUDA() {
-  HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
-  if (!HandleCuda) {
-    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
-    return 0;
-  }
-
-  HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
-  if (!HandleCudaRT) {
-    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
-    return 0;
-  }
-
-  return 1;
-}
-
-/* Get function pointers to the CUDA driver APIs.
- *
- * Note that compilers conforming to the ISO C standard are required to
- * generate a warning if a conversion from a void * pointer to a function
- * pointer is attempted, as in the following statements. Clang and newer
- * versions of GCC may not emit this warning, since such casts are valid
- * on POSIX 2008. For compilers that do emit the warning, we temporarily
- * disable -Wpedantic to avoid bloating the output with unnecessary
- * warnings.
- *
- * Reference:
- * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
- */
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wpedantic"
-static int initialDeviceAPIsCUDA() {
-  if (initialDeviceAPILibrariesCUDA() == 0)
-    return 0;
-
-  CuLaunchKernelFcnPtr =
-      (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
-
-  CuMemAllocFcnPtr =
-      (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
-
-  CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
-      HandleCuda, "cuMemAllocManaged");
-
-  CuMemFreeFcnPtr =
-      (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
-
-  CuMemcpyDtoHFcnPtr =
-      (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
-
-  CuMemcpyHtoDFcnPtr =
-      (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
-
-  CuModuleUnloadFcnPtr =
-      (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
-
-  CuProfilerStopFcnPtr =
-      (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop");
-
-  CuCtxDestroyFcnPtr =
-      (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
-
-  CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
-
-  CuDeviceGetCountFcnPtr =
-      (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
-
-  CuDeviceGetFcnPtr =
-      (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
-
-  CuCtxCreateFcnPtr =
-      (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
-
-  CuCtxGetCurrentFcnPtr =
-      (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent");
-
-  CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
-      HandleCuda, "cuModuleLoadDataEx");
-
-  CuModuleLoadDataFcnPtr =
-      (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
-
-  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
-      HandleCuda, "cuModuleGetFunction");
-
-  CuDeviceComputeCapabilityFcnPtr =
-      (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
-          HandleCuda, "cuDeviceComputeCapability");
-
-  CuDeviceGetNameFcnPtr =
-      (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
-
-  CuLinkAddDataFcnPtr =
-      (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
-
-  CuLinkCreateFcnPtr =
-      (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
-
-  CuLinkCompleteFcnPtr =
-      (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
-
-  CuLinkDestroyFcnPtr =
-      (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
-
-  CuCtxSynchronizeFcnPtr =
-      (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
-
-  /* Get function pointer to CUDA Runtime APIs. */
-  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
-      HandleCudaRT, "cudaThreadSynchronize");
-
-  return 1;
-}
-#pragma GCC diagnostic pop
-
-static PollyGPUContext *initContextCUDA() {
-  dump_function();
-  PollyGPUContext *Context;
-  CUdevice Device;
-
-  int Major = 0, Minor = 0, DeviceID = 0;
-  char DeviceName[256];
-  int DeviceCount = 0;
-
-  static __thread PollyGPUContext *CurrentContext = NULL;
-
-  if (CurrentContext)
-    return CurrentContext;
-
-  /* Get API handles. */
-  if (initialDeviceAPIsCUDA() == 0) {
-    fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
-    exit(-1);
-  }
-
-  if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
-    fprintf(stderr, "Initializing the CUDA driver API failed.\n");
-    exit(-1);
-  }
-
-  /* Get the number of devices that support CUDA. */
-  CuDeviceGetCountFcnPtr(&DeviceCount);
-  if (DeviceCount == 0) {
-    fprintf(stderr, "There is no device supporting CUDA.\n");
-    exit(-1);
-  }
-
-  CuDeviceGetFcnPtr(&Device, 0);
-
-  /* Get compute capabilities and the device name. */
-  CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
-  CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
-  debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
-
-  /* Create context on the device. */
-  Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
-  if (Context == 0) {
-    fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
-    exit(-1);
-  }
-  Context->Context = malloc(sizeof(CUDAContext));
-  if (Context->Context == 0) {
-    fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
-    exit(-1);
-  }
-
-  // In cases where managed memory is used, it is quite likely that
-  // `cudaMallocManaged` / `polly_mallocManaged` was called before
-  // `polly_initContext` was called.
-  //
-  // If `polly_initContext` calls `CuCtxCreate` when there already was a
-  // pre-existing context created by the runtime API, this causes code running
-  // on P100 to hang. So, we query for a pre-existing context to try and use.
-  // If there is no pre-existing context, we create a new context.
-
-  // The possible pre-existing context from previous runtime API calls.
-  CUcontext MaybeRuntimeAPIContext;
-  if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
-    fprintf(stderr, "cuCtxGetCurrent failed.\n");
-    exit(-1);
-  }
-
-  // There was no previous context; initialize it.
-  if (MaybeRuntimeAPIContext == NULL) {
-    if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
-                          Device) != CUDA_SUCCESS) {
-      fprintf(stderr, "cuCtxCreateFcnPtr failed.\n");
-      exit(-1);
-    }
-  } else {
-    ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
-  }
-
-  if (CacheMode)
-    CurrentContext = Context;
-
-  return Context;
-}
-
-static void freeKernelCUDA(PollyGPUFunction *Kernel) {
-  dump_function();
-
-  if (CacheMode)
-    return;
-
-  if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
-    CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
-
-  if (Kernel->Kernel)
-    free((CUDAKernel *)Kernel->Kernel);
-
-  if (Kernel)
-    free(Kernel);
-}
-
-static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
-                                       const char *KernelName) {
-  dump_function();
-
-  static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
-  static __thread int NextCacheItem = 0;
-
-  for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
-    // We exploit here the property that all Polly-ACC kernels are allocated
-    // as global constants, hence a pointer comparison is sufficient to
-    // determine equality.
- if (KernelCache[i] && - ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) { - debug_print(" -> using cached kernel\n"); - return KernelCache[i]; - } - } - - PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); - if (Function == 0) { - fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); - exit(-1); - } - Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel)); - if (Function->Kernel == 0) { - fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n"); - exit(-1); - } - - CUresult Res; - CUlinkState LState; - CUjit_option Options[6]; - void *OptionVals[6]; - float Walltime = 0; - unsigned long LogSize = 8192; - char ErrorLog[8192], InfoLog[8192]; - void *CuOut; - size_t OutSize; - - // Setup linker options - // Return walltime from JIT compilation - Options[0] = CU_JIT_WALL_TIME; - OptionVals[0] = (void *)&Walltime; - // Pass a buffer for info messages - Options[1] = CU_JIT_INFO_LOG_BUFFER; - OptionVals[1] = (void *)InfoLog; - // Pass the size of the info buffer - Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - OptionVals[2] = (void *)LogSize; - // Pass a buffer for error message - Options[3] = CU_JIT_ERROR_LOG_BUFFER; - OptionVals[3] = (void *)ErrorLog; - // Pass the size of the error buffer - Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - OptionVals[4] = (void *)LogSize; - // Make the linker verbose - Options[5] = CU_JIT_LOG_VERBOSE; - OptionVals[5] = (void *)1; - - memset(ErrorLog, 0, sizeof(ErrorLog)); - - CuLinkCreateFcnPtr(6, Options, OptionVals, &LState); - Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer, - strlen(BinaryBuffer) + 1, 0, 0, 0, 0); - if (Res != CUDA_SUCCESS) { - fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); - exit(-1); - } - - Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize); - if (Res != CUDA_SUCCESS) { - fprintf(stderr, "Complete ptx linker step failed.\n"); - fprintf(stderr, "\n%s\n", ErrorLog); - exit(-1); - } - - debug_print("CUDA Link Completed in %fms. 
Linker Output:\n%s\n", Walltime,
-              InfoLog);
-
-  Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
-                               CuOut);
-  if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Loading ptx assembly text failed.\n");
-    exit(-1);
-  }
-
-  Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
-                                  ((CUDAKernel *)Function->Kernel)->CudaModule,
-                                  KernelName);
-  if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Loading kernel function failed.\n");
-    exit(-1);
-  }
-
-  CuLinkDestroyFcnPtr(LState);
-
-  ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
-
-  if (CacheMode) {
-    if (KernelCache[NextCacheItem])
-      freeKernelCUDA(KernelCache[NextCacheItem]);
-
-    KernelCache[NextCacheItem] = Function;
-
-    NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
-  }
-
-  return Function;
-}
-
-static void synchronizeDeviceCUDA() {
-  dump_function();
-  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
-    fprintf(stderr, "Synchronizing device and host memory failed.\n");
-    exit(-1);
-  }
-}
-
-static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
-                                     long MemSize) {
-  dump_function();
-
-  CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
-  CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
-}
-
-static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
-                                     long MemSize) {
-  dump_function();
-
-  if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
-                         MemSize) != CUDA_SUCCESS) {
-    fprintf(stderr, "Copying results from device to host memory failed.\n");
-    exit(-1);
-  }
-}
-
-static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
-                             unsigned int GridDimY, unsigned int BlockDimX,
-                             unsigned int BlockDimY, unsigned int BlockDimZ,
-                             void **Parameters) {
-  dump_function();
-
-  unsigned GridDimZ = 1;
-  unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
-  CUstream Stream = 0;
-  void **Extra = 0;
-
-  CUresult Res;
-  Res =
-      CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
-                           GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
-                           SharedMemBytes, Stream, Parameters, Extra);
-  if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Launching CUDA kernel failed.\n");
-    exit(-1);
-  }
-}
-
-// Maximum number of managed memory pointers.
-#define DEFAULT_MAX_POINTERS 4000
-// For the rationale behind keeping a list of managed pointers, see
-// `polly_freeManaged`.
-void **g_managedptrs;
-unsigned long long g_nmanagedptrs = 0;
-unsigned long long g_maxmanagedptrs = 0;
-
-__attribute__((constructor)) static void initManagedPtrsBuffer() {
-  g_maxmanagedptrs = DEFAULT_MAX_POINTERS;
-  const char *maxManagedPointersString = getenv("POLLY_MAX_MANAGED_POINTERS");
-  if (maxManagedPointersString)
-    g_maxmanagedptrs = atoll(maxManagedPointersString);
-
-  g_managedptrs = (void **)malloc(sizeof(void *) * g_maxmanagedptrs);
-}
-
-// Add a pointer as being allocated by cuMemAllocManaged.
-void addManagedPtr(void *mem) {
-  assert(g_maxmanagedptrs > 0 && "g_maxmanagedptrs was set to 0!");
-  assert(g_nmanagedptrs < g_maxmanagedptrs &&
-         "We have hit the maximum number of "
-         "managed pointers allowed. Set the "
-         "POLLY_MAX_MANAGED_POINTERS environment variable. ");
-  g_managedptrs[g_nmanagedptrs++] = mem;
-}
-
-int isManagedPtr(void *mem) {
-  for (unsigned long long i = 0; i < g_nmanagedptrs; i++) {
-    if (g_managedptrs[i] == mem)
-      return 1;
-  }
-  return 0;
-}
-
-void freeManagedCUDA(void *mem) {
-  dump_function();
-
-  // In a real-world program in which this runtime was used (COSMO), there
-  // were more `free` calls in the original source than `malloc` calls.
-  // Hence, replacing all `free`s with `cudaFree` does not work, since we
-  // would try to free 'illegal' memory.
-  // As a quick fix, we keep a list of the managed allocations and check if
-  // `mem` is a managed memory pointer. If it is, we call `cudaFree`.
-  // If not, we pass it along to the underlying allocator.
-  // This is a hack, and can be removed if the underlying issue is fixed.
-  if (isManagedPtr(mem)) {
-    if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
-      fprintf(stderr, "cudaFree failed.\n");
-      exit(-1);
-    }
-    return;
-  } else {
-    free(mem);
-  }
-}
-
-void *mallocManagedCUDA(size_t size) {
-  // Note: [Size 0 allocations]
-  // Sometimes, a runtime computation of size can produce a size of 0
-  // for an allocation. In these cases, we do not wish to fail.
-  // The CUDA API fails on size 0 allocations.
-  // So we allocate a minimum size of 1.
-  if (!size && DebugMode)
-    fprintf(stderr, "cudaMallocManaged called with size 0. "
-                    "Promoting to size 1");
-  size = max(size, 1);
-  PollyGPUContext *_ = polly_initContextCUDA();
-  assert(_ && "polly_initContextCUDA failed");
-
-  void *newMemPtr;
-  const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size,
-                                               CU_MEM_ATTACH_GLOBAL);
-  if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
-    exit(-1);
-  }
-  addManagedPtr(newMemPtr);
-  return newMemPtr;
-}
-
-static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
-  dump_function();
-  CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
-  CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
-  free(DevPtr);
-  free(Allocation);
-}
-
-static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
-  if (!MemSize && DebugMode)
-    fprintf(stderr, "allocateMemoryForDeviceCUDA called with size 0. "
-                    "Promoting to size 1");
-  // see: [Size 0 allocations]
-  MemSize = max(MemSize, 1);
-  dump_function();
-
-  PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
-  if (DevData == 0) {
-    fprintf(stderr,
-            "Allocate memory for GPU device memory pointer failed."
-            " Line: %d | Size: %ld\n",
-            __LINE__, MemSize);
-    exit(-1);
-  }
-  DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
-  if (DevData->DevicePtr == 0) {
-    fprintf(stderr,
-            "Allocate memory for GPU device memory pointer failed."
-            " Line: %d | Size: %ld\n",
-            __LINE__, MemSize);
-    exit(-1);
-  }
-
-  CUresult Res =
-      CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
-
-  if (Res != CUDA_SUCCESS) {
-    fprintf(stderr,
-            "Allocate memory for GPU device memory pointer failed."
- " Line: %d | Size: %ld\n", - __LINE__, MemSize); - exit(-1); - } - - return DevData; -} - -static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) { - dump_function(); - - CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; - return (void *)DevPtr->Cuda; -} - -static void freeContextCUDA(PollyGPUContext *Context) { - dump_function(); - - CUDAContext *Ctx = (CUDAContext *)Context->Context; - if (Ctx->Cuda) { - CuProfilerStopFcnPtr(); - CuCtxDestroyFcnPtr(Ctx->Cuda); - free(Ctx); - free(Context); - } - - dlclose(HandleCuda); - dlclose(HandleCudaRT); -} - -#endif /* HAS_LIBCUDART */ -/******************************************************************************/ -/* API */ -/******************************************************************************/ - -PollyGPUContext *polly_initContext() { - DebugMode = getenv("POLLY_DEBUG") != 0; - CacheMode = getenv("POLLY_NOCACHE") == 0; - - dump_function(); - - PollyGPUContext *Context; - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - Context = initContextCUDA(); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - Context = initContextCL(); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } - - return Context; -} - -void polly_freeKernel(PollyGPUFunction *Kernel) { - dump_function(); - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - freeKernelCUDA(Kernel); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - freeKernelCL(Kernel); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } -} - -PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, - const char *KernelName) { - dump_function(); - - PollyGPUFunction *Function; - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - Function = getKernelCUDA(BinaryBuffer, KernelName); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - Function = getKernelCL(BinaryBuffer, KernelName); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } - - return Function; -} - -void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize) { - dump_function(); - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - copyFromHostToDeviceCUDA(HostData, DevData, MemSize); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - copyFromHostToDeviceCL(HostData, DevData, MemSize); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } -} - -void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize) { - dump_function(); - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - copyFromDeviceToHostCUDA(DevData, HostData, MemSize); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - copyFromDeviceToHostCL(DevData, HostData, MemSize); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } -} - -void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockDimX, - unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { - dump_function(); - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, - BlockDimZ, Parameters); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, - Parameters); - break; -#endif /* HAS_LIBOPENCL */ - 
default: - err_runtime(); - } -} - -void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { - dump_function(); - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - freeDeviceMemoryCUDA(Allocation); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - freeDeviceMemoryCL(Allocation); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } -} - -PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { - dump_function(); - - PollyGPUDevicePtr *DevData; - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - DevData = allocateMemoryForDeviceCUDA(MemSize); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - DevData = allocateMemoryForDeviceCL(MemSize); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } - - return DevData; -} - -void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { - dump_function(); - - void *DevPtr; - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - DevPtr = getDevicePtrCUDA(Allocation); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - DevPtr = getDevicePtrCL(Allocation); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } - - return DevPtr; -} - -void polly_synchronizeDevice() { - dump_function(); - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - synchronizeDeviceCUDA(); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - synchronizeDeviceCL(); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } -} - -void polly_freeContext(PollyGPUContext *Context) { - dump_function(); - - if (CacheMode) - return; - - switch (Runtime) { -#ifdef HAS_LIBCUDART - case RUNTIME_CUDA: - freeContextCUDA(Context); - break; -#endif /* HAS_LIBCUDART */ -#ifdef HAS_LIBOPENCL - case RUNTIME_CL: - freeContextCL(Context); - break; -#endif /* HAS_LIBOPENCL */ - default: - err_runtime(); - } -} - -void polly_freeManaged(void *mem) { - dump_function(); - -#ifdef HAS_LIBCUDART - freeManagedCUDA(mem); -#else - fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n"); - exit(-1); -#endif -} - -void *polly_mallocManaged(size_t size) { - dump_function(); - -#ifdef HAS_LIBCUDART - return mallocManagedCUDA(size); -#else - fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n"); - exit(-1); -#endif -} - -/* Initialize GPUJIT with CUDA as runtime library. */ -PollyGPUContext *polly_initContextCUDA() { -#ifdef HAS_LIBCUDART - Runtime = RUNTIME_CUDA; - return polly_initContext(); -#else - fprintf(stderr, "GPU Runtime was built without CUDA support.\n"); - exit(-1); -#endif /* HAS_LIBCUDART */ -} - -/* Initialize GPUJIT with OpenCL as runtime library. 
*/ -PollyGPUContext *polly_initContextCL() { -#ifdef HAS_LIBOPENCL - Runtime = RUNTIME_CL; - return polly_initContext(); -#else - fprintf(stderr, "GPU Runtime was built without OpenCL support.\n"); - exit(-1); -#endif /* HAS_LIBOPENCL */ -} diff --git a/polly/tools/GPURuntime/LICENSE.TXT b/polly/tools/GPURuntime/LICENSE.TXT deleted file mode 100644 --- a/polly/tools/GPURuntime/LICENSE.TXT +++ /dev/null @@ -1,310 +0,0 @@ -============================================================================== -The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: -============================================================================== - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - ----- LLVM Exceptions to the Apache 2.0 License ---- - -As an exception, if, as a result of your compiling your source code, portions -of this Software are embedded into an Object form of such source code, you -may redistribute such embedded portions in such Object form without complying -with the conditions of Sections 4(a), 4(b) and 4(d) of the License. - -In addition, if you combine or link compiled forms of this Software with -software that is licensed under the GPLv2 ("Combined Software") and if a -court of competent jurisdiction determines that the patent provision (Section -3), the indemnity provision (Section 9) or other Section of the License -conflicts with the conditions of the GPLv2, you may retroactively and -prospectively choose to deem waived or otherwise exclude such Section(s) of -the License, but only in their entirety and only with respect to the Combined -Software. - -============================================================================== -Software from third parties included in the LLVM Project: -============================================================================== -The LLVM Project contains third party software which is under different license -terms. All such code will be identified clearly using at least one of two -mechanisms: -1) It will be in a separate directory tree with its own `LICENSE.txt` or - `LICENSE` file at the top containing the specific license and restrictions - which apply to that software, or -2) It will contain specific license and restriction terms at the top of every - file. - -============================================================================== -Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): -============================================================================== - -The GPURuntime library is dual licensed under both the University of Illinois -"BSD-Like" license and the MIT license. As a user of this code you may choose -to use it under either license. As a contributor, you agree to allow your code -to be used under both. - -Full text of the relevant licenses is included below. - -============================================================================== - -University of Illinois/NCSA -Open Source License - -Copyright (c) 2009-2019 by the contributors listed in CREDITS.TXT - -All rights reserved. 
-
-Developed by:
-
-    Polly Team
-
-    http://polly.llvm.org
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal with
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-    * Redistributions of source code must retain the above copyright notice,
-      this list of conditions and the following disclaimers.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-      this list of conditions and the following disclaimers in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the names of the LLVM Team, University of Illinois at
-      Urbana-Champaign, nor the names of its contributors may be used to
-      endorse or promote products derived from this Software without specific
-      prior written permission.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
-SOFTWARE.
-
-==============================================================================
-
-Copyright (c) 2009-2016 by the contributors listed in CREDITS.TXT
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-
diff --git a/polly/www/documentation/gpgpucodegen.html b/polly/www/documentation/gpgpucodegen.html
deleted file mode 100644
--- a/polly/www/documentation/gpgpucodegen.html
+++ /dev/null
@@ -1,229 +0,0 @@
-Polly - GPGPU Code Generation
diff --git a/polly/www/index.html b/polly/www/index.html
--- a/polly/www/index.html
+++ b/polly/www/index.html
@@ -28,8 +28,7 @@
 on integer polyhedra to analyze and optimize the memory access pattern of a
 program. We currently perform classical loop transformations, especially
 tiling and loop fusion to improve data-locality. Polly can also exploit
-OpenMP level parallelism, expose SIMDization opportunities. Work has also been
-done in the area of automatic GPU code generation.
+OpenMP level parallelism, expose SIMDization opportunities.
 
 For many users, however, it's not the existing optimizations in Polly that
 are of most interest, but the new analyses and optimizations enabled by the
 Polly
diff --git a/polly/www/todo.html b/polly/www/todo.html
--- a/polly/www/todo.html
+++ b/polly/www/todo.html
@@ -342,14 +342,6 @@ Johannes
 GPGPU Code
-Generation
-in progress
-
-Yabin
-
-
 Allow optimizers to change memory access functions
 Done

Polly - GPGPU Code Generation

WARNING: This project was part of the Google Summer of Code 2012. It is
currently not finished, but it is in the design and implementation stage.
The ideas/plans described here may not yet be implemented in Polly and may
change later on.

This project adds a GPGPU code generation feature to Polly.

Objective
The overall objective of this GSoC project is to create a preliminary
implementation of GPGPU code generation for Polly. With this addition, users
can parallelize some perfectly nested loops with Polly to execute on a
heterogeneous platform, composed of CPU and GPU.

There are several successful projects on automatic source-to-source GPU code
transformation. C-to-CUDA[1] uses the standard Pluto algorithms for computing
an affine schedule and then applies a wavefront transformation to obtain one
sequential and n-1 parallel loops. The parallel loops are then mapped onto
the blocks and threads of the GPU. PPCG[2] introduces some advanced
algorithms which can expose much more parallelism than other methods, and it
also introduces affine partition heuristics and code generation algorithms
for locality enhancement in the registers and shared memory.

Since automatic GPGPU code generation is quite a complex problem, and what we
target is a low-level intermediate representation, LLVM IR, rather than a
high-level language source, it is important for us to set a proper objective
as a first step towards a complete solution to GPGPU code generation for
LLVM IR.

Firstly, we plan to target two kinds of relatively simple test cases. One is
comprised of purely parallel and perfectly nested loops, like the following
code:

parfor(int i=0 to M)
  parfor(int j=0 to N)
    LoopBody(i, j);

The other is one in which all the loops are parallel except the innermost
one, just like this:

parfor(int i=0 to M)
  parfor(int j=0 to N)
    non-parfor(int k=0 to K)
      LoopBody(i, j, k);

The LoopBody part should be limited to instructions or function calls
(intrinsics) which can be handled by LLVM's NVPTX backend.
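For illustration, here is a sketch (added for clarity, not from the original
page) of a C loop nest whose body satisfies this restriction: only plain
arithmetic and a math intrinsic that NVPTX can lower, with no I/O or other
host-library calls.

#include <math.h>

/* Both loops are parallel; sqrtf maps to an NVPTX intrinsic. A call to
   printf or malloc in the body would disqualify the loop nest. */
void scale(int M, int N, float A[M][N], float B[M][N]) {
  for (int i = 0; i < M; i++)   /* parfor */
    for (int j = 0; j < N; j++) /* parfor */
      B[i][j] = sqrtf(A[i][j]) + 2.0f * A[i][j];
}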
On the other hand, we focus on building a preliminary and scalable framework - of GPGPU code generation for polly. Thus we plan to employ relatively simple - tiling and mapping algorithms and optimize them later.

-

Work Flow

-

GPGPU Code Generation In General

-

C-to-CUDA[1] and PPCG[2] propose similar steps to solve the automatic GPGPU - code generation problem.

-
  • Look for parallel loops.
  • -
  • Create a polyhedral model from the loops.
  • -
  • Tile and map the loops to GPU blocks and threads.
  • -
  • Determine where to place the data.
  • -

    What has been done in Polly

    -

    Polly has implemented the 1st, 2nd and part of the 3rd of the above steps and - many other analysis and transformation passes.

    -

    What to do in Polly

    -

    Unlike many source-to-source optimizers such as C-to-CUDA and PPCG, Polly is - a low-level optimizer, which means we can't use a source-level compiler - (e.g. NVCC) to generate the final assembly for the device. We need manually - insert device driver API calls to execute the generated kernel assembly - text.

    -

    In this project, we assume that the device driver library has provided an - interface to launch kernels in the form of assembly text. Fortunately, most - of the mainstream GPU vendors provide such a feature in their products (see - ptxjit of NVIDIA GPUs and CAL of AMD GPUs). Generally speaking, what we - are going to do in Polly is:

    -
-  • Find a way to tile the parallel loops.
-  • Find a way to extract the loop body and transform it into thread-centric
-    parallel code.
-  • Find a way to store/load the thread-centric code into/from a device
-    module.
-  • Find a way to pass the target machine information and generate code of
-    the device module for the target.
-  • Find a way to map the tiled loop to GPU blocks and threads.
-  • Find a way to insert CUDA synchronization operations on-demand.
-  • Find a way to generate the memory copy operations between a host and a
-    device.
-  • Implement/wrap a runtime library to serve as the execution engine for
-    the generated device code (see the sketch after this list).
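-
-To make the last item concrete, here is a minimal sketch (our illustration,
-not the actual GPURuntime implementation) of how a runtime wrapper can
-launch a kernel given as PTX assembly text, using the JIT ("ptxjit") path of
-the CUDA driver API:
-
-  #include <cuda.h>
-
-  /* Load a PTX string and launch the named kernel; error checking omitted.
-     All names and parameters here are illustrative. */
-  void launchPTX(const char *PTX, const char *KernelName,
-                 unsigned GridX, unsigned GridY,
-                 unsigned BlockX, unsigned BlockY, void **Args) {
-    CUdevice Dev;
-    CUcontext Ctx;
-    CUmodule Mod;
-    CUfunction Kernel;
-
-    cuInit(0);
-    cuDeviceGet(&Dev, 0);
-    cuCtxCreate(&Ctx, 0, Dev);
-
-    /* JIT-compile the PTX assembly text into device code. */
-    cuModuleLoadDataEx(&Mod, PTX, 0, NULL, NULL);
-    cuModuleGetFunction(&Kernel, Mod, KernelName);
-
-    cuLaunchKernel(Kernel, GridX, GridY, 1, BlockX, BlockY, 1,
-                   0 /* shared memory */, NULL /* stream */, Args, NULL);
-    cuCtxSynchronize();
-  }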

-The Work Flow
-
-In this section, we assume that the host CPU is X86 and the device is an
-NVIDIA CUDA-compatible GPU. We will use the following test case to describe
-our work flow.
-
-for(i = 0; i < 128; i++)
-  for(j = 0; j < 128; j++)
-    A[i][j] = i*128 + j;
-

-The work flow of our code generator is as follows.
-

-1. We first use Polly's jscop file importer to get the desired 4-level
-parallel tiled code.
-
-The "schedule" part of the pre-optimization jscop file is the following:
-
-"schedule" : "{ Stmt_for_body3[i0, i1] -> schedule[0, i0, 0, i1, 0] }"
-
-The jscop file describing the tiling transformation is:
-
-"schedule" : "{ Stmt_for_body3[i0, i1] -> schedule[0, o0, o1, o2, o3]:
-              o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and
-              o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and
-              i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
-
-We can test the schedule with the following command line:
-
-opt -load /path/to/polly/build/LLVMPolly.so -basic-aa -polly-import-jscop \
-    -polly-import-jscop-postfix=transformed+gpu \
-    -polly-ast -analyze -q ./test.ll
-
-The output of this schedule is:
-
-for (c2=0;c2<=7;c2++) {
-  for (c3=0;c3<=15;c3++) {
-    for (c4=0;c4<=7;c4++) {
-      for (c5=0;c5<=15;c5++) {
-        Stmt_for_body3(16*c2+c3,16*c4+c5);
-      }
-    }
-  }
-}
-
-Now we have a 4-dimensional parallel loop nest with a single SCoP statement
-in it.
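-
-To illustrate the intended mapping (our own sketch, assuming the usual
-assignment of tile loops to GPU blocks and point loops to threads), c2 and
-c4 become block ids while c3 and c5 become thread ids, so the loop nest
-above corresponds to the following CUDA kernel launched on an 8x8 grid of
-16x16 blocks:
-
-  /* i0 = 16*o0 + o1 and i1 = 16*o2 + o3, as in the schedule above. */
-  __global__ void kernel(int *A) {
-    int i = 16 * blockIdx.x + threadIdx.x;
-    int j = 16 * blockIdx.y + threadIdx.y;
-    A[i * 128 + j] = i * 128 + j;
-  }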

-2. We then extract the loop body (or the innermost non-parallel loop) into
-an LLVM function, tagging it with the PTX_Kernel calling convention.
-

-3. We extract the PTX_Kernel function into a temporary module, set the
-target triple (e.g. nvptx64-unknown-linux) for the module, transform the
-temporary module into a string, store it in the original module, and erase
-the PTX_Kernel function.
-
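-Conceptually, after this step the device code survives only as data in the
-host module. In C terms (illustrative only; the real artifact is a string
-global in LLVM IR), it is as if the host program contained:
-
-  /* Serialized device module, to be translated to PTX later. */
-  const char KernelModuleIR[] =
-      "target triple = \"nvptx64-unknown-linux\"\n"
-      "define ptx_kernel void @gpu_kernel(i32* %A) { ... }\n";
-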

-4. We replace the loops with their GPGPU counterpart. The GPGPU part of the
-code is composed of a call to the llvm.codegen intrinsic and function calls
-to our GPU runtime library.
-
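-As a rough sketch of this replacement (illustrative names throughout; the
-actual calls are emitted as LLVM IR, and the code reuses the launchPTX
-helper sketched earlier), the host side then behaves like:
-
-  void host_side(int *A) {
-    /* llvm_codegen stands in for the llvm.codegen intrinsic: translate the
-       embedded device module (its triple was set in step 3) to PTX. */
-    const char *PTX = llvm_codegen(KernelModuleIR);
-
-    /* Hypothetical GPU runtime API: allocate, launch, copy back, free. */
-    void *DevA = gpu_malloc(128 * 128 * sizeof(int));
-    void *Args[] = { &DevA };
-    launchPTX(PTX, "gpu_kernel", /*Grid=*/8, 8, /*Block=*/16, 16, Args);
-    gpu_memcpy_to_host(A, DevA, 128 * 128 * sizeof(int));
-    gpu_free(DevA);
-  }
-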

-5. Finally, we generate the executable program with llc, or run the
-optimized LLVM IR with a JIT compiler like lli.
-

-Usage
-

-1. Apply the llvm.codegen intrinsic patch to the LLVM code base.

-
-cd /path/to/llvm/source
-git am /path/to/polly/source/utils/0001-Add-llvm.codegen-intrinsic.patch
-

-2. Build the test case.

-
-/path/to/polly/source/test/create_ll.sh test.c
-
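-For this walk-through, test.c would contain the loop nest from "The Work
-Flow" section inside a function named gpu_codegen, for instance:
-
-  int A[128][128];
-
-  void gpu_codegen() {
-    for (int i = 0; i < 128; i++)
-      for (int j = 0; j < 128; j++)
-        A[i][j] = i*128 + j;
-  }
-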

-3. Get and edit the jscop file (taking the function "gpu_codegen" as an
-example).

-
-opt -load /path/to/polly/build/lib/LLVMPolly.so -basic-aa \
-    -polly-export-jscop ./test.ll
-cp gpu_codegen___%for.cond---%for.end8.jscop \
-   gpu_codegen___%for.cond---%for.end8.jscop.transformed+gpu
-vi gpu_codegen___%for.cond---%for.end8.jscop.transformed+gpu
-

-(Please refer to the section "The Work Flow" on how to edit the "schedule"
-part of a statement.)
-

-4. Optimize the code with GPGPU code generation.

-
-opt -load /path/to/polly/build/lib/LLVMPolly.so -basic-aa \
-    -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu \
-    -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen ./test.ll -S \
-    -o test.gpued.ll
-

-5. Build the final assembly and executable.

-
-llc test.gpued.ll -o test.s
-gcc test.s -lGPURuntime -o test
-

-(Please make sure that LD_LIBRARY_PATH is set properly so that
-/path/to/polly/build/lib/libGPURuntime.so is visible to gcc.)
-

-TODO List
-
-Tasks                                                   Status                            Owner
-Tiling the Parallel Loops with An External Jscop File   Open, In Design                   Yabin Hu
-GPU Runtime Library Implementation                      Coding Finished, In Reviewing
-llvm.codegen Intrinsic Implementation                   Coding Finished, To Be Reviewed
-Code Generation For Host                                50% Done
-

-References
-
-[1] Automatic C-to-CUDA Code Generation for Affine Programs.
-    Muthu Manikandan Baskaran, J. Ramanujam and P. Sadayappan.
-    International Conference on Compiler Construction (CC) 2010.
-
-[2] PPCG Project.
-    http://freecode.com/projects/ppcg
-
-[3] Where is the Data? Why You Cannot Debate GPU vs. CPU Performance Without
-    the Answer.
-    Chris Gregg and Kim Hazelwood.
-    International Symposium on Performance Analysis of Systems and Software
-    (ISPASS) 2011.