Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -152,10 +152,9 @@ option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF) if (POLLY_ENABLE_GPGPU_CODEGEN) - # Do not require CUDA/OpenCL, as GPU code generation test cases can be run - # without a CUDA/OpenCL library. + # Do not require CUDA, as GPU code generation test cases can be run without + # a cuda library. FIND_PACKAGE(CUDA) - FIND_PACKAGE(OpenCL) set(GPU_CODEGEN TRUE) else(POLLY_ENABLE_GPGPU_CODEGEN) set(GPU_CODEGEN FALSE) @@ -164,13 +163,8 @@ # Support GPGPU code generation if the library is available. if (CUDALIB_FOUND) - add_definitions(-DHAS_LIBCUDART) INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} ) endif(CUDALIB_FOUND) -if (OpenCL_FOUND) - add_definitions(-DHAS_LIBOPENCL) - INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} ) -endif(OpenCL_FOUND) option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON) if (NOT POLLY_BUNDLED_ISL) Index: include/polly/CodeGen/PPCGCodeGeneration.h =================================================================== --- include/polly/CodeGen/PPCGCodeGeneration.h +++ /dev/null @@ -1,24 +0,0 @@ -//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Take a scop created by ScopInfo and map it to GPU code using the ppcg -// GPU mapping strategy. -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PPCGCODEGENERATION_H -#define POLLY_PPCGCODEGENERATION_H - -/// The GPU Architecture to target. -enum GPUArch { NVPTX64 }; - -/// The GPU Runtime implementation to use. -enum GPURuntime { CUDA, OpenCL }; - -#endif // POLLY_PPCGCODEGENERATION_H Index: include/polly/LinkAllPasses.h =================================================================== --- include/polly/LinkAllPasses.h +++ include/polly/LinkAllPasses.h @@ -15,7 +15,6 @@ #ifndef POLLY_LINKALLPASSES_H #define POLLY_LINKALLPASSES_H -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/Config/config.h" #include "polly/PruneUnprofitable.h" #include "polly/Simplify.h" @@ -49,8 +48,7 @@ llvm::Pass *createIslAstInfoPass(); llvm::Pass *createCodeGenerationPass(); #ifdef GPU_CODEGEN -llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64, - GPURuntime Runtime = GPURuntime::CUDA); +llvm::Pass *createPPCGCodeGenerationPass(); #endif llvm::Pass *createIslScheduleOptimizerPass(); llvm::Pass *createFlattenSchedulePass(); Index: include/polly/ScopBuilder.h =================================================================== --- include/polly/ScopBuilder.h +++ include/polly/ScopBuilder.h @@ -54,6 +54,33 @@ // The Scop std::unique_ptr scop; + // Methods for pattern matching against Fortran code generated by dragonegg. + // @{ + + /// Try to pattern match and find the array descriptor structure in case of a + /// fortran array accesss. succeeds on load/store into a fortran array that + /// has been allocated. + /// + /// @see polly::FortranArrayDescriptor + /// + /// @param Inst The load/store instruction that access the memory. + /// + /// @note assumes -polly-canonicalize has been run. 
+  GlobalValue *findFortranArrayDescriptorForAllocArrayAccess(MemAccInst Inst);
+
+  /// Try to pattern match and find the array descriptor structure in case of a
+  /// Fortran array access. Succeeds on a load/store into a Fortran array that
+  /// is declared globally and has not been allocated in the current module.
+  ///
+  /// @see polly::FortranArrayDescriptor
+  ///
+  /// @param Inst The load/store instruction that accesses the memory.
+  ///
+  /// @note Assumes -polly-canonicalize has been run.
+  GlobalValue *
+  findFortranArrayDescriptorForNonAllocArrayAccess(MemAccInst Inst);
+  // @}
+
   // Build the SCoP for Region @p R.
   void buildScop(Region &R, AssumptionCache &AC);
Index: include/polly/ScopInfo.h
===================================================================
--- include/polly/ScopInfo.h
+++ include/polly/ScopInfo.h
@@ -602,6 +602,13 @@
   /// Updated access relation read from JSCOP file.
   isl_map *NewAccessRelation;
+
+  /// Fortran arrays that are created using "Allocate" are stored in terms
+  /// of a descriptor struct. This maintains a raw pointer to the memory,
+  /// along with auxiliary fields with information such as dimensions.
+  /// We hold a reference to the descriptor corresponding to a MemoryAccess
+  /// into a Fortran array. FAD stands for "Fortran Array Descriptor".
+  AssertingVH<GlobalValue> FAD;
   // @}
   __isl_give isl_basic_map *createBasicAccessMap(ScopStmt *Statement);
@@ -1006,6 +1013,10 @@
   /// Get the reduction type of this access
   ReductionType getReductionType() const { return RedType; }
+  /// Set the array descriptor corresponding to the Array on which the
+  /// memory access is performed.
+  void setFortranArrayDescriptor(GlobalValue *FAD);
+
   /// Update the original access relation.
   ///
   /// We need to update the original access relation during scop construction,
Index: lib/Analysis/ScopBuilder.cpp
===================================================================
--- lib/Analysis/ScopBuilder.cpp
+++ lib/Analysis/ScopBuilder.cpp
@@ -113,6 +113,159 @@
   }
 }
+/// This is matching against code generated by dragonegg after simplifier
+/// passes have been run.
+///
+/// This is trying to match against "@globaldescriptor", the descriptor
+/// of the Fortran array that is being accessed at the load/store. This style
+/// of code is generated for arrays that have been allocated using "Allocate"
+/// in the same module.
+///
+/// Pattern Match:
+/// 1. %mallocmem = i8* @malloc(i64 40)
+///
+/// 5. store i8* %mallocmem, i8** getelementptr inbounds
+///      (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"*
+///       @globaldescriptor, i64 0, i32 0), align 32
+///
+/// 2. %typedmem = bitcast i8* %mallocmem to <memtype>*
+///
+/// 3 is optional because if you are writing to the 0th index, you don't
+///   need a GEP.
+/// 3. [%slot = getelementptr inbounds i8, i8* %typedmem, i64 <index>]
+///
+/// 4.1 store/load <memtype> <val>, <memtype>* %typedmem, align 8
+/// 4.2 store/load <memtype> <val>, <memtype>* %slot, align 8
+GlobalValue *
+ScopBuilder::findFortranArrayDescriptorForAllocArrayAccess(MemAccInst Inst) {
+  // match: 4.1 & 4.2 store/load
+  if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst))
+    return nullptr;
+
+  // match: 4
+  if (Inst.getAlignment() != 8)
+    return nullptr;
+
+  Value *Address = Inst.getPointerOperand();
+
+  const BitCastInst *Bitcast = nullptr;
+  // [match: 3]
+  if (auto *Slot = dyn_cast<GetElementPtrInst>(Address)) {
+    Value *TypedMem = Slot->getPointerOperand();
+    // match: 2
+    Bitcast = dyn_cast<BitCastInst>(TypedMem);
+  } else {
+    // match: 2
+    Bitcast = dyn_cast<BitCastInst>(Address);
+  }
+
+  if (!Bitcast)
+    return nullptr;
+
+  auto *MallocMem = Bitcast->getOperand(0);
+
+  // match: 1
+  auto *MallocCall = dyn_cast<CallInst>(MallocMem);
+  if (!MallocCall)
+    return nullptr;
+
+  Function *MallocFn = MallocCall->getCalledFunction();
+  if (!(MallocFn && MallocFn->hasName() && MallocFn->getName() == "malloc"))
+    return nullptr;
+
+  // Find all uses of the malloc'd memory.
+  // We are looking for a "store" into a struct with the type being the Fortran
+  // descriptor type.
+  for (auto user : MallocMem->users()) {
+
+    // match: 5
+    auto *MallocStore = dyn_cast<StoreInst>(user);
+    if (!MallocStore)
+      continue;
+
+    auto *DescriptorGEP =
+        dyn_cast<GEPOperator>(MallocStore->getPointerOperand());
+    if (!DescriptorGEP)
+      continue;
+
+    // match: 5
+    auto DescriptorType =
+        dyn_cast<StructType>(DescriptorGEP->getSourceElementType());
+    if (!(DescriptorType && DescriptorType->hasName()))
+      continue;
+
+    // Name does not match the expected name.
+    if (!DescriptorType->getName().startswith("struct.array"))
+      continue;
+
+    GlobalValue *Descriptor =
+        dyn_cast<GlobalValue>(DescriptorGEP->getPointerOperand());
+
+    if (!Descriptor)
+      continue;
+
+    return Descriptor;
+  }
+
+  return nullptr;
+}
+
+/// This is matching against code generated by dragonegg after simplifier
+/// passes have been run.
+///
+/// This is trying to match against "@globaldescriptor", the descriptor
+/// of the Fortran array that is being accessed at the load/store. This style
+/// of code is generated for arrays that have been declared global and
+/// are being accessed across modules.
+///
+/// Pattern Match:
+/// 1. %mem = load double*, double** bitcast (%"struct.array1_real(kind=8)"*
+///    @globaldescriptor to double**), align 32
+///
+/// 2 is optional because if you are writing to the 0th index, you don't
+///   need a GEP.
+/// 2. [%slot = getelementptr inbounds i8, i8* %mem, i64 <index>]
+///
+/// 3.1 store/load <memtype> <val>, <memtype>* %slot, align 8
+/// 3.2 store/load <memtype> <val>, <memtype>* %mem, align 8
+GlobalValue *
+ScopBuilder::findFortranArrayDescriptorForNonAllocArrayAccess(MemAccInst Inst) {
+  // match: 3
+  if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst))
+    return nullptr;
+
+  // match: 3
+  if (Inst.getAlignment() != 8)
+    return nullptr;
+
+  Value *Slot = Inst.getPointerOperand();
+
+  LoadInst *MemLoad = nullptr;
+  // [match: 2]
+  if (auto *SlotGEP = dyn_cast<GetElementPtrInst>(Slot)) {
+    // match: 1
+    MemLoad = dyn_cast<LoadInst>(SlotGEP->getPointerOperand());
+  } else {
+    // match: 1
+    MemLoad = dyn_cast<LoadInst>(Slot);
+  }
+
+  if (!MemLoad)
+    return nullptr;
+
+  auto *BitcastOperator =
+      dyn_cast<BitCastOperator>(MemLoad->getPointerOperand());
+  if (!BitcastOperator)
+    return nullptr;
+
+  GlobalValue *Descriptor =
+      dyn_cast<GlobalValue>(BitcastOperator->getOperand(0));
+  if (!Descriptor)
+    return nullptr;
+
+  return Descriptor;
+}
+
 bool ScopBuilder::buildAccessMultiDimFixed(MemAccInst Inst, ScopStmt *Stmt) {
   Value *Val = Inst.getValueOperand();
   Type *ElementType = Val->getType();
@@ -532,9 +685,17 @@
                                 Type *ElementType, bool IsAffine,
                                 ArrayRef<const SCEV *> Subscripts,
                                 ArrayRef<const SCEV *> Sizes,
                                 Value *AccessValue) {
   ArrayBasePointers.insert(BaseAddress);
-  addMemoryAccess(MemAccInst->getParent(), MemAccInst, AccType, BaseAddress,
-                  ElementType, IsAffine, AccessValue, Subscripts, Sizes,
-                  MemoryKind::Array);
+  auto *MemAccess = addMemoryAccess(
+      MemAccInst->getParent(), MemAccInst, AccType, BaseAddress, ElementType,
+      IsAffine, AccessValue, Subscripts, Sizes, MemoryKind::Array);
+
+  // TODO: change to loop of function pointers?
+  if (GlobalValue *FAD =
+          findFortranArrayDescriptorForAllocArrayAccess(MemAccInst))
+    MemAccess->setFortranArrayDescriptor(FAD);
+  else if (GlobalValue *FAD =
+               findFortranArrayDescriptorForNonAllocArrayAccess(MemAccInst))
+    MemAccess->setFortranArrayDescriptor(FAD);
 }
 
 void ScopBuilder::ensureValueWrite(Instruction *Inst) {
Index: lib/Analysis/ScopInfo.cpp
===================================================================
--- lib/Analysis/ScopInfo.cpp
+++ lib/Analysis/ScopInfo.cpp
@@ -974,7 +974,7 @@
       Sizes(Sizes.begin(), Sizes.end()), AccessInstruction(AccessInst),
       AccessValue(AccessValue), IsAffine(Affine),
       Subscripts(Subscripts.begin(), Subscripts.end()), AccessRelation(nullptr),
-      NewAccessRelation(nullptr) {
+      NewAccessRelation(nullptr), FAD(nullptr) {
   static const std::string TypeStrings[] = {"", "_Read", "_Write", "_MayWrite"};
   const std::string Access = TypeStrings[AccType] + utostr(Stmt->size());
@@ -986,7 +986,8 @@
                            __isl_take isl_map *AccRel)
     : Kind(MemoryKind::Array), AccType(AccType), RedType(RT_NONE),
       Statement(Stmt), InvalidDomain(nullptr), AccessInstruction(nullptr),
-      IsAffine(true), AccessRelation(nullptr), NewAccessRelation(AccRel) {
+      IsAffine(true), AccessRelation(nullptr), NewAccessRelation(AccRel),
+      FAD(nullptr) {
   auto *ArrayInfoId = isl_map_get_tuple_id(NewAccessRelation, isl_dim_out);
   auto *SAI = ScopArrayInfo::getFromId(ArrayInfoId);
   Sizes.push_back(nullptr);
@@ -1022,6 +1023,22 @@
   return OS;
 }
+void MemoryAccess::setFortranArrayDescriptor(GlobalValue *FAD) {
+  this->FAD = FAD;
+
+// TODO: write checks to make sure it looks _exactly_ like a Fortran array
+// descriptor.
+#ifndef NDEBUG
+  StructType *ty = dyn_cast<StructType>(FAD->getValueType());
+  assert(ty && "expected value of type Fortran array descriptor");
+  assert(ty->hasName() && ty->getName().startswith("struct.array") &&
+         "expected global to follow Fortran array descriptor type naming "
+         "convention");
+  assert(ty->getNumElements() == 4 &&
"expected layout to be like Fortran array descriptor type"); +#endif +} + void MemoryAccess::print(raw_ostream &OS) const { switch (AccType) { case READ: @@ -1034,7 +1051,14 @@ OS.indent(12) << "MayWriteAccess :=\t"; break; } + OS << "[Reduction Type: " << getReductionType() << "] "; + + if (FAD) { + OS << "[Fortran array descriptor: " << FAD->getName(); + OS << "] "; + }; + OS << "[Scalar: " << isScalarKind() << "]\n"; OS.indent(16) << getOriginalAccessRelationStr() << ";\n"; if (hasNewAccessRelation()) Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/CodeGen/IslAst.h" #include "polly/CodeGen/IslNodeBuilder.h" #include "polly/CodeGen/Utils.h" @@ -154,9 +153,9 @@ GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, Scop &S, BasicBlock *StartBlock, - gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) + gpu_prog *Prog) : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), - Prog(Prog), Runtime(Runtime), Arch(Arch) { + Prog(Prog) { getExprBuilder().setIDToSAI(&IDToSAI); } @@ -202,12 +201,6 @@ /// The GPU program we generate code for. gpu_prog *Prog; - /// The GPU Runtime implementation to use (OpenCL or CUDA). - GPURuntime Runtime; - - /// The GPU Architecture to target. - GPUArch Arch; - /// Class to free isl_ids. class IslIdDeleter { public: @@ -759,17 +752,7 @@ } Value *GPUNodeBuilder::createCallInitContext() { - const char *Name; - - switch (Runtime) { - case GPURuntime::CUDA: - Name = "polly_initContextCUDA"; - break; - case GPURuntime::OpenCL: - Name = "polly_initContextCL"; - break; - } - + const char *Name = "polly_initContext"; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *F = M->getFunction(Name); @@ -1045,15 +1028,7 @@ void GPUNodeBuilder::createKernelSync() { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - - Function *Sync; - - switch (Arch) { - case GPUArch::NVPTX64: - Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); - break; - } - + auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); Builder.CreateCall(Sync, {}); } @@ -1459,12 +1434,7 @@ auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, GPUModule.get()); - - switch (Arch) { - case GPUArch::NVPTX64: - FN->setCallingConv(CallingConv::PTX_Kernel); - break; - } + FN->setCallingConv(CallingConv::PTX_Kernel); auto Arg = FN->arg_begin(); for (long i = 0; i < Kernel->n_array; i++) { @@ -1525,19 +1495,12 @@ } void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { - Intrinsic::ID IntrinsicsBID[2]; - Intrinsic::ID IntrinsicsTID[3]; - - switch (Arch) { - case GPUArch::NVPTX64: - IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; - IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; + Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x, + Intrinsic::nvvm_read_ptx_sreg_ctaid_y}; - IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; - IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; - IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; - break; - } + Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x, + 
Intrinsic::nvvm_read_ptx_sreg_tid_y, + Intrinsic::nvvm_read_ptx_sreg_tid_z}; auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { std::string Name = isl_id_get_name(Id); @@ -1686,18 +1649,11 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, SetVector &SubtreeValues) { + std::string Identifier = "kernel_" + std::to_string(Kernel->id); GPUModule.reset(new Module(Identifier, Builder.getContext())); - - switch (Arch) { - case GPUArch::NVPTX64: - if (Runtime == GPURuntime::CUDA) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); - else if (Runtime == GPURuntime::OpenCL) - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); - GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); - break; - } + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); + GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); @@ -1718,21 +1674,7 @@ } std::string GPUNodeBuilder::createKernelASM() { - llvm::Triple GPUTriple; - - switch (Arch) { - case GPUArch::NVPTX64: - switch (Runtime) { - case GPURuntime::CUDA: - GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda")); - break; - case GPURuntime::OpenCL: - GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl")); - break; - } - break; - } - + llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); std::string ErrMsg; auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); @@ -1743,17 +1685,9 @@ TargetOptions Options; Options.UnsafeFPMath = FastMath; - - std::string subtarget; - - switch (Arch) { - case GPUArch::NVPTX64: - subtarget = CudaVersion; - break; - } - - std::unique_ptr TargetM(GPUTarget->createTargetMachine( - GPUTriple.getTriple(), subtarget, "", Options, Optional())); + std::unique_ptr TargetM( + GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", + Options, Optional())); SmallString<0> ASMString; raw_svector_ostream ASMStream(ASMString); @@ -1805,10 +1739,6 @@ public: static char ID; - GPURuntime Runtime = GPURuntime::CUDA; - - GPUArch Architecture = GPUArch::NVPTX64; - /// The scop that is currently processed. 
Scop *S; @@ -2592,7 +2522,7 @@ executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, - StartBlock, Prog, Runtime, Architecture); + StartBlock, Prog); // TODO: Handle LICM auto SplitBlock = StartBlock->getSinglePredecessor(); @@ -2680,12 +2610,7 @@ char PPCGCodeGeneration::ID = 1; -Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) { - PPCGCodeGeneration *generator = new PPCGCodeGeneration(); - generator->Runtime = Runtime; - generator->Architecture = Arch; - return generator; -} +Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", "Polly - Apply PPCG translation to SCOP", false, false) Index: lib/Support/RegisterPasses.cpp =================================================================== --- lib/Support/RegisterPasses.cpp +++ lib/Support/RegisterPasses.cpp @@ -23,7 +23,6 @@ #include "polly/Canonicalization.h" #include "polly/CodeGen/CodeGeneration.h" #include "polly/CodeGen/CodegenCleanup.h" -#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/DeLICM.h" #include "polly/DependenceInfo.h" #include "polly/FlattenSchedule.h" @@ -102,23 +101,6 @@ ), cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory)); -#ifdef GPU_CODEGEN -static cl::opt GPURuntimeChoice( - "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"), - cl::values(clEnumValN(GPURuntime::CUDA, "libcudart", - "use the CUDA Runtime API"), - clEnumValN(GPURuntime::OpenCL, "libopencl", - "use the OpenCL Runtime API")), - cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory)); - -static cl::opt - GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"), - cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64", - "target NVIDIA 64-bit architecture")), - cl::init(GPUArch::NVPTX64), cl::ZeroOrMore, - cl::cat(PollyCategory)); -#endif - VectorizerChoice polly::PollyVectorizerChoice; static cl::opt Vectorizer( "polly-vectorizer", cl::desc("Select the vectorization strategy"), @@ -327,8 +309,7 @@ if (Target == TARGET_GPU) { #ifdef GPU_CODEGEN - PM.add( - polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice)); + PM.add(polly::createPPCGCodeGenerationPass()); #endif } else { switch (CodeGeneration) { Index: test/FortranDetection/global-malloc-nonvectored.ll =================================================================== --- /dev/null +++ test/FortranDetection/global-malloc-nonvectored.ll @@ -0,0 +1,143 @@ +; RUN: opt -S -analyze -polly-process-unprofitable -polly-remarks-minimal \ +; RUN: -polly-canonicalize -polly-scops -polly-dependences \ +; RUN: -debug-only=polly-dependence -polly-canonicalize -polly-allow-nonaffine \ +; RUN: -polly-ignore-aliasing -polly-invariant-load-hoisting \ +; RUN: < %s| FileCheck %s +; +; MODULE src_soil +; USE data_parameters, ONLY : & +; wp, & ! KIND-type parameter for real variables +; iintegers ! 
KIND-type parameter for standard integer variables +; IMPLICIT NONE +; REAL (KIND = wp), ALLOCATABLE, PRIVATE :: & +; xdzs (:) +; CONTAINS +; SUBROUTINE terra1(n) +; INTEGER, intent(in) :: n +; INTEGER (KIND=iintegers) :: & +; j +; Allocate(xdzs(n)); +; DO j = 2, n +; xdzs(j) = xdzs(j) * xdzs(j) + xdzs(j - 1) +; END DO +; END SUBROUTINE terra1 +; END MODULE src_soil + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +module asm "\09.ident\09\22GCC: (GNU) 4.6.4 LLVM: 3.3.1\22" + +%"struct.array1_real(kind=8)" = type { i8*, i64, i64, [1 x %struct.descriptor_dimension] } +%struct.descriptor_dimension = type { i64, i64, i64 } + +@__src_soil_MOD_xdzs = unnamed_addr global %"struct.array1_real(kind=8)" zeroinitializer, align 32 +@.cst = private unnamed_addr constant [67 x i8] c"Integer overflow when calculating the amount of memory to allocate\00", align 64 +@.cst1 = private unnamed_addr constant [37 x i8] c"Allocation would exceed memory limit\00", align 64 +@.cst2 = private unnamed_addr constant [93 x i8] c"At line 23 of file /home/siddhart/cosmo-self-installation/cosmo-pompa/cosmo/src/src_soil.f90\00", align 64 +@.cst3 = private unnamed_addr constant [55 x i8] c"Attempting to allocate already allocated variable '%s'\00", align 64 +@.cst4 = private unnamed_addr constant [5 x i8] c"xdzs\00", align 8 + +; Function Attrs: nounwind uwtable +define void @__src_soil_MOD_terra1(i32* noalias nocapture %n) unnamed_addr #0 { +entry: + store i64 537, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 2), align 16, !tbaa !0 + store i64 1, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 3, i64 0, i32 1), align 8, !tbaa !0 + %0 = load i32, i32* %n, align 4, !tbaa !3 + %1 = sext i32 %0 to i64 + store i64 %1, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 3, i64 0, i32 2), align 8, !tbaa !0 + store i64 1, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 3, i64 0, i32 0), align 8, !tbaa !0 + %2 = icmp slt i32 %0, 0 + %3 = select i1 %2, i64 0, i64 %1 + %4 = icmp eq i64 %3, 0 + br i1 %4, label %"16", label %"8" + +"8": ; preds = %entry + %5 = sdiv i64 9223372036854775807, %1 + %6 = icmp slt i64 %5, 1 + %7 = icmp slt i32 %0, 1 + %8 = shl nsw i64 %3, 3 + %.2 = select i1 %7, i64 0, i64 %8 + br i1 %6, label %"15", label %"16" + +"15": ; preds = %"8" + + unreachable + +"16": ; preds = %"8", %entry + %.24 = phi i64 [ %.2, %"8" ], [ 0, %entry ] + %9 = load i8*, i8** getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 0), align 32, !tbaa !5 + %10 = icmp eq i8* %9, null + br i1 %10, label %"17", label %"20" + +"17": ; preds = %"16" + %11 = icmp ne i64 %.24, 0 + %12 = select i1 %11, i64 %.24, i64 1 + %13 = tail call noalias i8* @malloc(i64 %12) #2 ;<= 1. 
malloc + %14 = icmp eq i8* %13, null + br i1 %14, label %"18", label %"19" + +"18": ; preds = %"17" + unreachable + +"19": ; preds = %"17" + store i8* %13, i8** getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 0), align 32, !tbaa !5 + store i64 -1, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 1), align 8, !tbaa !0 + %15 = icmp sgt i32 %0, 1 + br i1 %15, label %"21.preheader", label %return + +"21.preheader": ; preds = %"19" + %16 = bitcast i8* %13 to double* ;<= 2. bitcast to double* + %17 = add i32 %0, 1 + br label %"21" + +"20": ; preds = %"16" + unreachable + +"21": ; preds = %"21", %"21.preheader" + %18 = phi double [ undef, %"21.preheader" ], [ %23, %"21" ] + %indvars.iv = phi i64 [ 2, %"21.preheader" ], [ %indvars.iv.next, %"21" ] + %19 = add nsw i64 %indvars.iv, -1 + %20 = getelementptr inbounds double, double* %16, i64 %19 ;<= 3. GEP + %21 = load double, double* %20, align 8, !tbaa !7 + %22 = fmul double %21, %21 + %23 = fadd double %22, %18 + store double %23, double* %20, align 8, !tbaa !7 ;<= 4. store + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %17 + br i1 %exitcond, label %return, label %"21" + +return: ; preds = %"21", %"19" + ret void +} + +; Function Attrs: noreturn +declare void @_gfortran_runtime_error(i8*, ...) #1 + +; Function Attrs: nounwind +declare noalias i8* @malloc(i64) #2 + +; Function Attrs: noreturn +declare void @_gfortran_os_error(i8*) #1 + +; Function Attrs: noreturn +declare void @_gfortran_runtime_error_at(i8*, i8*, ...) #1 + +attributes #0 = { nounwind uwtable } +attributes #1 = { noreturn } +attributes #2 = { nounwind } +attributes #3 = { noreturn nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"alias set 4: integer(kind=8)", !2} +!2 = distinct !{!2} +!3 = !{!4, !4, i64 0} +!4 = !{!"alias set 11: integer(kind=4)", !2} +!5 = !{!6, !6, i64 0} +!6 = !{!"alias set 3: void*", !2} +!7 = !{!8, !8, i64 0} +!8 = !{!"alias set 18: real(kind=8)", !2} + +; CHECK: ReadAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] +; CHECK: MustWriteAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] Index: test/FortranDetection/global-nonmalloc-nonvectored.ll =================================================================== --- /dev/null +++ test/FortranDetection/global-nonmalloc-nonvectored.ll @@ -0,0 +1,87 @@ +; RUN: opt -S -analyze -polly-process-unprofitable -polly-remarks-minimal \ +; RUN: -polly-canonicalize -polly-scops -polly-dependences \ +; RUN: -debug-only=polly-dependence -polly-canonicalize -polly-allow-nonaffine \ +; RUN: -polly-ignore-aliasing -polly-invariant-load-hoisting \ +; RUN: < %s| FileCheck %s +; +; MODULE src_soil +; USE data_parameters, ONLY : & +; wp, & ! KIND-type parameter for real variables +; iintegers ! 
KIND-type parameter for standard integer variables +; IMPLICIT NONE +; REAL (KIND = wp), ALLOCATABLE, PRIVATE :: & +; xdzs (:) +; CONTAINS +; +; SUBROUTINE terra1(n) +; INTEGER, intent(in) :: n +; +; INTEGER (KIND=iintegers) :: & +; j +; +; DO j = 22, n +; xdzs(j) = xdzs(j) * xdzs(j) + xdzs(j - 1) +; END DO +; END SUBROUTINE terra1 +; END MODULE src_soil + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +module asm "\09.ident\09\22GCC: (GNU) 4.6.4 LLVM: 3.3.1\22" + +%"struct.array1_real(kind=8)" = type { i8*, i64, i64, [1 x %struct.descriptor_dimension] } +%struct.descriptor_dimension = type { i64, i64, i64 } + +@__src_soil_MOD_xdzs = unnamed_addr global %"struct.array1_real(kind=8)" zeroinitializer, align 32 + +; Function Attrs: nounwind uwtable +define void @__src_soil_MOD_terra1(i32* noalias nocapture %n) unnamed_addr #0 { +entry: + %0 = load i32, i32* %n, align 4, !tbaa !0 + %1 = icmp sgt i32 %0, 21 + br i1 %1, label %"3.preheader", label %return + +"3.preheader": ; preds = %entry + %2 = load i64, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 1), align 8, !tbaa !3 + %3 = load i8*, i8** getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @__src_soil_MOD_xdzs, i64 0, i32 0), align 32, !tbaa !5 + %4 = bitcast i8* %3 to double* + %5 = add i32 %0, 1 + br label %"3" + +"3": ; preds = %"3", %"3.preheader" + %indvars.iv = phi i64 [ 22, %"3.preheader" ], [ %indvars.iv.next, %"3" ] + %6 = add nsw i64 %indvars.iv, %2 + %7 = getelementptr inbounds double, double* %4, i64 %6 + %8 = load double, double* %7, align 8, !tbaa !7 + %9 = fmul double %8, %8 + %10 = add nsw i64 %indvars.iv, -1 + %11 = add nsw i64 %10, %2 + %12 = getelementptr inbounds double, double* %4, i64 %11 + %13 = load double, double* %12, align 8, !tbaa !7 + %14 = fadd double %9, %13 + store double %14, double* %7, align 8, !tbaa !7 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %5 + br i1 %exitcond, label %return, label %"3" + +return: ; preds = %"3", %entry + ret void +} + +attributes #0 = { nounwind uwtable } + +!0 = !{!1, !1, i64 0} +!1 = !{!"alias set 11: integer(kind=4)", !2} +!2 = distinct !{!2} +!3 = !{!4, !4, i64 0} +!4 = !{!"alias set 4: integer(kind=8)", !2} +!5 = !{!6, !6, i64 0} +!6 = !{!"alias set 3: void*", !2} +!7 = !{!8, !8, i64 0} +!8 = !{!"alias set 18: real(kind=8)", !2} + +; CHECK: ReadAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] +; CHECK: ReadAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] +; CHECK: MustWriteAccess := [Reduction Type: NONE] [Fortran array descriptor: __src_soil_MOD_xdzs] [Scalar: 0] Index: test/GPGPU/cuda-managed-memory-simple.ll =================================================================== --- test/GPGPU/cuda-managed-memory-simple.ll +++ test/GPGPU/cuda-managed-memory-simple.ll @@ -35,7 +35,7 @@ ; CHECK-NOT: polly_freeDeviceMemory ; CHECK-NOT: polly_allocateMemoryForDevice -; CHECK: %13 = call i8* @polly_initContextCUDA() +; CHECK: %13 = call i8* @polly_initContext() ; CHECK-NEXT: %14 = bitcast i32* %A to i8* ; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; CHECK-NEXT: 
store i8* %14, i8** %polly_launch_0_param_0 @@ -46,7 +46,7 @@ ; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8* ; CHECK-NEXT: store i8* %19, i8** %18 -; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) +; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) ; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) ; CHECK-NEXT: call void @polly_freeKernel(i8* %20) ; CHECK-NEXT: call void @polly_synchronizeDevice() Index: test/GPGPU/size-cast.ll =================================================================== --- test/GPGPU/size-cast.ll +++ test/GPGPU/size-cast.ll @@ -29,7 +29,7 @@ ; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1) ; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0); -; IR: call i8* @polly_initContextCUDA() +; IR: call i8* @polly_initContext() ; IR-NEXT: sext i32 %arg to i64 ; IR-NEXT: mul i64 ; IR-NEXT: @polly_allocateMemoryForDevice Index: tools/CMakeLists.txt =================================================================== --- tools/CMakeLists.txt +++ tools/CMakeLists.txt @@ -1,5 +1,5 @@ -if (CUDALIB_FOUND OR OpenCL_FOUND) +if (CUDALIB_FOUND) add_subdirectory(GPURuntime) -endif (CUDALIB_FOUND OR OpenCL_FOUND) +endif (CUDALIB_FOUND) set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) Index: tools/GPURuntime/GPUJIT.h =================================================================== --- tools/GPURuntime/GPUJIT.h +++ tools/GPURuntime/GPUJIT.h @@ -76,27 +76,12 @@ * */ -typedef enum PollyGPURuntimeT { - RUNTIME_NONE, - RUNTIME_CUDA, - RUNTIME_CL -} PollyGPURuntime; - typedef struct PollyGPUContextT PollyGPUContext; typedef struct PollyGPUFunctionT PollyGPUFunction; typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; -typedef struct OpenCLContextT OpenCLContext; -typedef struct OpenCLKernelT OpenCLKernel; -typedef struct OpenCLDevicePtrT OpenCLDevicePtr; - -typedef struct CUDAContextT CUDAContext; -typedef struct CUDAKernelT CUDAKernel; -typedef struct CUDADevicePtrT CUDADevicePtr; - -PollyGPUContext *polly_initContextCUDA(); -PollyGPUContext *polly_initContextCL(); -PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, +PollyGPUContext *polly_initContext(); +PollyGPUFunction *polly_getKernel(const char *PTXBuffer, const char *KernelName); void polly_freeKernel(PollyGPUFunction *Kernel); void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -12,20 +12,8 @@ /******************************************************************************/ #include "GPUJIT.h" - -#ifdef HAS_LIBCUDART #include #include -#endif /* HAS_LIBCUDART */ - -#ifdef HAS_LIBOPENCL -#ifdef __APPLE__ -#include -#else -#include -#endif -#endif /* HAS_LIBOPENCL */ - #include #include #include @@ -34,8 +22,6 @@ static int DebugMode; static int CacheMode; -static PollyGPURuntime Runtime = RUNTIME_NONE; - static void debug_print(const char *format, ...) 
{ if (!DebugMode) return; @@ -47,853 +33,18 @@ } #define dump_function() debug_print("-> %s\n", __func__) -#define KERNEL_CACHE_SIZE 10 - -static void err_runtime() { - fprintf(stderr, "Runtime not correctly initialized.\n"); - exit(-1); -} - +/* Define Polly's GPGPU data types. */ struct PollyGPUContextT { - void *Context; -}; - -struct PollyGPUFunctionT { - void *Kernel; -}; - -struct PollyGPUDevicePtrT { - void *DevicePtr; -}; - -/******************************************************************************/ -/* OpenCL */ -/******************************************************************************/ -#ifdef HAS_LIBOPENCL - -struct OpenCLContextT { - cl_context Context; - cl_command_queue CommandQueue; -}; - -struct OpenCLKernelT { - cl_kernel Kernel; - cl_program Program; - const char *BinaryString; -}; - -struct OpenCLDevicePtrT { - cl_mem MemObj; -}; - -/* Dynamic library handles for the OpenCL runtime library. */ -static void *HandleOpenCL; - -/* Type-defines of function pointer to OpenCL Runtime API. */ -typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, - cl_platform_id *Platforms, - cl_uint *NumPlatforms); -static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; - -typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform, - cl_device_type DeviceType, - cl_uint NumEntries, cl_device_id *Devices, - cl_uint *NumDevices); -static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; - -typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device, - cl_device_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; - -typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName, - size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet); -static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; - -typedef cl_context clCreateContextFcnTy( - const cl_context_properties *Properties, cl_uint NumDevices, - const cl_device_id *Devices, - void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo, - size_t CB, void *UserData), - void *UserData, cl_int *ErrcodeRet); -static clCreateContextFcnTy *clCreateContextFcnPtr; - -typedef cl_command_queue -clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device, - cl_command_queue_properties Properties, - cl_int *ErrcodeRet); -static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; - -typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags, - size_t Size, void *HostPtr, - cl_int *ErrcodeRet); -static clCreateBufferFcnTy *clCreateBufferFcnPtr; - -typedef cl_int -clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer, - cl_bool BlockingWrite, size_t Offset, size_t Size, - const void *Ptr, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; - -typedef cl_program clCreateProgramWithBinaryFcnTy( - cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, - const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, - cl_int *ErrcodeRet); -static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; - -typedef cl_int clBuildProgramFcnTy( - cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList, - const char *Options, - void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData), - void *UserData); -static clBuildProgramFcnTy *clBuildProgramFcnPtr; - -typedef cl_kernel clCreateKernelFcnTy(cl_program Program, - const char *KernelName, - 
cl_int *ErrcodeRet); -static clCreateKernelFcnTy *clCreateKernelFcnPtr; - -typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex, - size_t ArgSize, const void *ArgValue); -static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; - -typedef cl_int clEnqueueNDRangeKernelFcnTy( - cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim, - const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize, cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, cl_event *Event); -static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; - -typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue, - cl_mem Buffer, cl_bool BlockingRead, - size_t Offset, size_t Size, void *Ptr, - cl_uint NumEventsInWaitList, - const cl_event *EventWaitList, - cl_event *Event); -static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr; - -typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue); -static clFlushFcnTy *clFlushFcnPtr; - -typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue); -static clFinishFcnTy *clFinishFcnPtr; - -typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel); -static clReleaseKernelFcnTy *clReleaseKernelFcnPtr; - -typedef cl_int clReleaseProgramFcnTy(cl_program Program); -static clReleaseProgramFcnTy *clReleaseProgramFcnPtr; - -typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject); -static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr; - -typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue); -static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr; - -typedef cl_int clReleaseContextFcnTy(cl_context Context); -static clReleaseContextFcnTy *clReleaseContextFcnPtr; - -static void *getAPIHandleCL(void *Handle, const char *FuncName) { - char *Err; - void *FuncPtr; - dlerror(); - FuncPtr = dlsym(Handle, FuncName); - if ((Err = dlerror()) != 0) { - fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err); - return 0; - } - return FuncPtr; -} - -static int initialDeviceAPILibrariesCL() { - HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); - if (!HandleOpenCL) { - fprintf(stderr, "Cannot open library: %s. \n", dlerror()); - return 0; - } - return 1; -} - -static int initialDeviceAPIsCL() { - if (initialDeviceAPILibrariesCL() == 0) - return 0; - - /* Get function pointer to OpenCL Runtime API. - * - * Note that compilers conforming to the ISO C standard are required to - * generate a warning if a conversion from a void * pointer to a function - * pointer is attempted as in the following statements. The warning - * of this kind of cast may not be emitted by clang and new versions of gcc - * as it is valid on POSIX 2008. 
- */ - clGetPlatformIDsFcnPtr = - (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs"); - - clGetDeviceIDsFcnPtr = - (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs"); - - clGetDeviceInfoFcnPtr = - (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo"); - - clGetKernelInfoFcnPtr = - (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo"); - - clCreateContextFcnPtr = - (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext"); - - clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( - HandleOpenCL, "clCreateCommandQueue"); - - clCreateBufferFcnPtr = - (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer"); - - clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueWriteBuffer"); - - clCreateProgramWithBinaryFcnPtr = - (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( - HandleOpenCL, "clCreateProgramWithBinary"); - - clBuildProgramFcnPtr = - (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram"); - - clCreateKernelFcnPtr = - (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel"); - - clSetKernelArgFcnPtr = - (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg"); - - clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueNDRangeKernel"); - - clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueReadBuffer"); - - clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush"); - - clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish"); - - clReleaseKernelFcnPtr = - (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel"); - - clReleaseProgramFcnPtr = - (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram"); - - clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL( - HandleOpenCL, "clReleaseMemObject"); - - clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( - HandleOpenCL, "clReleaseCommandQueue"); - - clReleaseContextFcnPtr = - (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext"); - - return 1; -} - -/* Context and Device. */ -static PollyGPUContext *GlobalContext = NULL; -static cl_device_id GlobalDeviceID = NULL; - -/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ -static void printOpenCLError(int Error); - -static void checkOpenCLError(int Ret, const char *format, ...) { - if (Ret == CL_SUCCESS) - return; - - printOpenCLError(Ret); - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - exit(-1); -} - -static PollyGPUContext *initContextCL() { - dump_function(); - - PollyGPUContext *Context; - - cl_platform_id PlatformID = NULL; - cl_device_id DeviceID = NULL; - cl_uint NumDevicesRet; - cl_int Ret; - - char DeviceRevision[256]; - char DeviceName[256]; - size_t DeviceRevisionRetSize, DeviceNameRetSize; - - static __thread PollyGPUContext *CurrentContext = NULL; - - if (CurrentContext) - return CurrentContext; - - /* Get API handles. */ - if (initialDeviceAPIsCL() == 0) { - fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); - exit(-1); - } - - /* Get number of devices that support OpenCL. 
*/ - static const int NumberOfPlatforms = 1; - Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL); - checkOpenCLError(Ret, "Failed to get platform IDs.\n"); - // TODO: Extend to CL_DEVICE_TYPE_ALL? - static const int NumberOfDevices = 1; - Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices, - &DeviceID, &NumDevicesRet); - checkOpenCLError(Ret, "Failed to get device IDs.\n"); - - GlobalDeviceID = DeviceID; - if (NumDevicesRet == 0) { - fprintf(stderr, "There is no device supporting OpenCL.\n"); - exit(-1); - } - - /* Get device revision. */ - Ret = - clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision), - DeviceRevision, &DeviceRevisionRetSize); - checkOpenCLError(Ret, "Failed to fetch device revision.\n"); - - /* Get device name. */ - Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName), - DeviceName, &DeviceNameRetSize); - checkOpenCLError(Ret, "Failed to fetch device name.\n"); - - debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); - - /* Create context on the device. */ - Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); - if (Context == 0) { - fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); - exit(-1); - } - Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext)); - if (Context->Context == 0) { - fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n"); - exit(-1); - } - ((OpenCLContext *)Context->Context)->Context = - clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret); - checkOpenCLError(Ret, "Failed to create context.\n"); - - static const int ExtraProperties = 0; - ((OpenCLContext *)Context->Context)->CommandQueue = - clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context, - DeviceID, ExtraProperties, &Ret); - checkOpenCLError(Ret, "Failed to create command queue.\n"); - - if (CacheMode) - CurrentContext = Context; - - GlobalContext = Context; - return Context; -} - -static void freeKernelCL(PollyGPUFunction *Kernel) { - dump_function(); - - if (CacheMode) - return; - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); - checkOpenCLError(Ret, "Failed to flush command queue.\n"); - Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); - checkOpenCLError(Ret, "Failed to finish command queue.\n"); - - if (((OpenCLKernel *)Kernel->Kernel)->Kernel) { - cl_int Ret = - clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel); - checkOpenCLError(Ret, "Failed to release kernel.\n"); - } - - if (((OpenCLKernel *)Kernel->Kernel)->Program) { - cl_int Ret = - clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program); - checkOpenCLError(Ret, "Failed to release program.\n"); - } - - if (Kernel->Kernel) - free((OpenCLKernel *)Kernel->Kernel); - - if (Kernel) - free(Kernel); -} - -static PollyGPUFunction *getKernelCL(const char *BinaryBuffer, - const char *KernelName) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; - static __thread int NextCacheItem = 0; - - for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { - // We exploit here the property that all Polly-ACC kernels are allocated - // as global constants, hence a pointer comparision is sufficient to - // 
determin equality. - if (KernelCache[i] && - ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString == - BinaryBuffer) { - debug_print(" -> using cached kernel\n"); - return KernelCache[i]; - } - } - - PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); - if (Function == 0) { - fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); - exit(-1); - } - Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel)); - if (Function->Kernel == 0) { - fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n"); - exit(-1); - } - - if (!GlobalDeviceID) { - fprintf(stderr, "GPGPU-code generation not initialized correctly.\n"); - exit(-1); - } - - cl_int Ret; - size_t BinarySize = strlen(BinaryBuffer); - ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID, - (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL, - &Ret); - checkOpenCLError(Ret, "Failed to create program from binary.\n"); - - Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1, - &GlobalDeviceID, NULL, NULL, NULL); - checkOpenCLError(Ret, "Failed to build program.\n"); - - ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr( - ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret); - checkOpenCLError(Ret, "Failed to create kernel.\n"); - - ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer; - - if (CacheMode) { - if (KernelCache[NextCacheItem]) - freeKernelCL(KernelCache[NextCacheItem]); - - KernelCache[NextCacheItem] = Function; - - NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; - } - - return Function; -} - -static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clEnqueueWriteBufferFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->CommandQueue, - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, - HostData, 0, NULL, NULL); - checkOpenCLError(Ret, "Copying data from host memory to device failed.\n"); -} - -static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - cl_int Ret; - Ret = clEnqueueReadBufferFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->CommandQueue, - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, - HostData, 0, NULL, NULL); - checkOpenCLError(Ret, "Copying results from device to host memory failed.\n"); -} - -static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockDimX, - unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { - dump_function(); - - cl_int Ret; - cl_uint NumArgs; - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel; - Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS, - sizeof(cl_uint), &NumArgs, NULL); - checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n"); - - // TODO: Pass the size of the kernel arguments in to launchKernelCL, along - // with the arguments themselves. This is a dirty workaround that can be - // broken. 
- for (cl_uint i = 0; i < NumArgs; i++) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = - clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1, - (void *)Parameters[i]); - checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); - } - } - } - if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) { - fprintf(stderr, "Failed to set Kernel argument.\n"); - printOpenCLError(Ret); - exit(-1); - } - } - - unsigned int GridDimZ = 1; - size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY, - BlockDimZ * GridDimZ}; - size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ}; - - static const int WorkDim = 3; - OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; - Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel, - WorkDim, NULL, GlobalWorkSize, - LocalWorkSize, 0, NULL, NULL); - checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n"); -} - -static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) { - dump_function(); - - OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; - cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj); - checkOpenCLError(Ret, "Failed to free device memory.\n"); - - free(DevPtr); - free(Allocation); -} - -static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); - if (DevData == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr)); - if (DevData->DevicePtr == 0) { - fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); - exit(-1); - } - - cl_int Ret; - ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj = - clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context, - CL_MEM_READ_WRITE, MemSize, NULL, &Ret); - checkOpenCLError(Ret, - "Allocate memory for GPU device memory pointer failed.\n"); - - return DevData; -} - -static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) { - dump_function(); - - OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; - return (void *)DevPtr->MemObj; -} - -static void synchronizeDeviceCL() { - dump_function(); - - if (!GlobalContext) { - fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); - exit(-1); - } - - if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) != - CL_SUCCESS) { - fprintf(stderr, "Synchronizing device and host memory failed.\n"); - exit(-1); - } -} - -static void freeContextCL(PollyGPUContext *Context) { - dump_function(); - - cl_int Ret; - - GlobalContext = NULL; - - OpenCLContext *Ctx = (OpenCLContext *)Context->Context; - if (Ctx->CommandQueue) { - Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue); - checkOpenCLError(Ret, "Could not release command queue.\n"); - } - - if (Ctx->Context) { - Ret = clReleaseContextFcnPtr(Ctx->Context); - checkOpenCLError(Ret, "Could not release context.\n"); - } - - free(Ctx); - free(Context); -} - -static void printOpenCLError(int Error) { - - switch (Error) { - case CL_SUCCESS: - // 
Success, don't print an error.
-    break;
-
-  // JIT/Runtime errors.
-  case CL_DEVICE_NOT_FOUND:
-    fprintf(stderr, "Device not found.\n");
-    break;
-  case CL_DEVICE_NOT_AVAILABLE:
-    fprintf(stderr, "Device not available.\n");
-    break;
-  case CL_COMPILER_NOT_AVAILABLE:
-    fprintf(stderr, "Compiler not available.\n");
-    break;
-  case CL_MEM_OBJECT_ALLOCATION_FAILURE:
-    fprintf(stderr, "Mem object allocation failure.\n");
-    break;
-  case CL_OUT_OF_RESOURCES:
-    fprintf(stderr, "Out of resources.\n");
-    break;
-  case CL_OUT_OF_HOST_MEMORY:
-    fprintf(stderr, "Out of host memory.\n");
-    break;
-  case CL_PROFILING_INFO_NOT_AVAILABLE:
-    fprintf(stderr, "Profiling info not available.\n");
-    break;
-  case CL_MEM_COPY_OVERLAP:
-    fprintf(stderr, "Mem copy overlap.\n");
-    break;
-  case CL_IMAGE_FORMAT_MISMATCH:
-    fprintf(stderr, "Image format mismatch.\n");
-    break;
-  case CL_IMAGE_FORMAT_NOT_SUPPORTED:
-    fprintf(stderr, "Image format not supported.\n");
-    break;
-  case CL_BUILD_PROGRAM_FAILURE:
-    fprintf(stderr, "Build program failure.\n");
-    break;
-  case CL_MAP_FAILURE:
-    fprintf(stderr, "Map failure.\n");
-    break;
-  case CL_MISALIGNED_SUB_BUFFER_OFFSET:
-    fprintf(stderr, "Misaligned sub buffer offset.\n");
-    break;
-  case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
-    fprintf(stderr, "Exec status error for events in wait list.\n");
-    break;
-  case CL_COMPILE_PROGRAM_FAILURE:
-    fprintf(stderr, "Compile program failure.\n");
-    break;
-  case CL_LINKER_NOT_AVAILABLE:
-    fprintf(stderr, "Linker not available.\n");
-    break;
-  case CL_LINK_PROGRAM_FAILURE:
-    fprintf(stderr, "Link program failure.\n");
-    break;
-  case CL_DEVICE_PARTITION_FAILED:
-    fprintf(stderr, "Device partition failed.\n");
-    break;
-  case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
-    fprintf(stderr, "Kernel arg info not available.\n");
-    break;
-
-  // Compiler errors.
-  case CL_INVALID_VALUE:
-    fprintf(stderr, "Invalid value.\n");
-    break;
-  case CL_INVALID_DEVICE_TYPE:
-    fprintf(stderr, "Invalid device type.\n");
-    break;
-  case CL_INVALID_PLATFORM:
-    fprintf(stderr, "Invalid platform.\n");
-    break;
-  case CL_INVALID_DEVICE:
-    fprintf(stderr, "Invalid device.\n");
-    break;
-  case CL_INVALID_CONTEXT:
-    fprintf(stderr, "Invalid context.\n");
-    break;
-  case CL_INVALID_QUEUE_PROPERTIES:
-    fprintf(stderr, "Invalid queue properties.\n");
-    break;
-  case CL_INVALID_COMMAND_QUEUE:
-    fprintf(stderr, "Invalid command queue.\n");
-    break;
-  case CL_INVALID_HOST_PTR:
-    fprintf(stderr, "Invalid host pointer.\n");
-    break;
-  case CL_INVALID_MEM_OBJECT:
-    fprintf(stderr, "Invalid memory object.\n");
-    break;
-  case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-    fprintf(stderr, "Invalid image format descriptor.\n");
-    break;
-  case CL_INVALID_IMAGE_SIZE:
-    fprintf(stderr, "Invalid image size.\n");
-    break;
-  case CL_INVALID_SAMPLER:
-    fprintf(stderr, "Invalid sampler.\n");
-    break;
-  case CL_INVALID_BINARY:
-    fprintf(stderr, "Invalid binary.\n");
-    break;
-  case CL_INVALID_BUILD_OPTIONS:
-    fprintf(stderr, "Invalid build options.\n");
-    break;
-  case CL_INVALID_PROGRAM:
-    fprintf(stderr, "Invalid program.\n");
-    break;
-  case CL_INVALID_PROGRAM_EXECUTABLE:
-    fprintf(stderr, "Invalid program executable.\n");
-    break;
-  case CL_INVALID_KERNEL_NAME:
-    fprintf(stderr, "Invalid kernel name.\n");
-    break;
-  case CL_INVALID_KERNEL_DEFINITION:
-    fprintf(stderr, "Invalid kernel definition.\n");
-    break;
-  case CL_INVALID_KERNEL:
-    fprintf(stderr, "Invalid kernel.\n");
-    break;
-  case CL_INVALID_ARG_INDEX:
-    fprintf(stderr, "Invalid arg index.\n");
-    break;
-  case CL_INVALID_ARG_VALUE:
-    fprintf(stderr, "Invalid arg value.\n");
-    break;
-  case CL_INVALID_ARG_SIZE:
-    fprintf(stderr, "Invalid arg size.\n");
-    break;
-  case CL_INVALID_KERNEL_ARGS:
-    fprintf(stderr, "Invalid kernel args.\n");
-    break;
-  case CL_INVALID_WORK_DIMENSION:
-    fprintf(stderr, "Invalid work dimension.\n");
-    break;
-  case CL_INVALID_WORK_GROUP_SIZE:
-    fprintf(stderr, "Invalid work group size.\n");
-    break;
-  case CL_INVALID_WORK_ITEM_SIZE:
-    fprintf(stderr, "Invalid work item size.\n");
-    break;
-  case CL_INVALID_GLOBAL_OFFSET:
-    fprintf(stderr, "Invalid global offset.\n");
-    break;
-  case CL_INVALID_EVENT_WAIT_LIST:
-    fprintf(stderr, "Invalid event wait list.\n");
-    break;
-  case CL_INVALID_EVENT:
-    fprintf(stderr, "Invalid event.\n");
-    break;
-  case CL_INVALID_OPERATION:
-    fprintf(stderr, "Invalid operation.\n");
-    break;
-  case CL_INVALID_GL_OBJECT:
-    fprintf(stderr, "Invalid GL object.\n");
-    break;
-  case CL_INVALID_BUFFER_SIZE:
-    fprintf(stderr, "Invalid buffer size.\n");
-    break;
-  case CL_INVALID_MIP_LEVEL:
-    fprintf(stderr, "Invalid mip level.\n");
-    break;
-  case CL_INVALID_GLOBAL_WORK_SIZE:
-    fprintf(stderr, "Invalid global work size.\n");
-    break;
-  case CL_INVALID_PROPERTY:
-    fprintf(stderr, "Invalid property.\n");
-    break;
-  case CL_INVALID_IMAGE_DESCRIPTOR:
-    fprintf(stderr, "Invalid image descriptor.\n");
-    break;
-  case CL_INVALID_COMPILER_OPTIONS:
-    fprintf(stderr, "Invalid compiler options.\n");
-    break;
-  case CL_INVALID_LINKER_OPTIONS:
-    fprintf(stderr, "Invalid linker options.\n");
-    break;
-  case CL_INVALID_DEVICE_PARTITION_COUNT:
-    fprintf(stderr, "Invalid device partition count.\n");
-    break;
-  case CL_INVALID_PIPE_SIZE:
-    fprintf(stderr, "Invalid pipe size.\n");
-    break;
-  case CL_INVALID_DEVICE_QUEUE:
-    fprintf(stderr, "Invalid device queue.\n");
-    break;
-
-  // NVIDIA specific error.
-  case -9999:
-    fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
-    break;
-
-  default:
-    fprintf(stderr, "Unknown error code!\n");
-    break;
-  }
-}
-
-#endif /* HAS_LIBOPENCL */
-/******************************************************************************/
-/*                                   CUDA                                     */
-/******************************************************************************/
-#ifdef HAS_LIBCUDART
-
-struct CUDAContextT {
   CUcontext Cuda;
 };
 
-struct CUDAKernelT {
+struct PollyGPUFunctionT {
   CUfunction Cuda;
   CUmodule CudaModule;
-  const char *BinaryString;
+  const char *PTXString;
 };
 
-struct CUDADevicePtrT {
+struct PollyGPUDevicePtrT {
   CUdeviceptr Cuda;
 };
@@ -906,10 +57,10 @@
 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
 
 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
-    CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
-    unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
-    unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
-    void **KernelParams, void **Extra);
+    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
+    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
+    void **kernelParams, void **extra);
 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
 
 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
@@ -944,8 +95,8 @@
                                                  void **);
 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
 
-typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
-                                               const void *Image);
+typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
+                                               const void *image);
 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
 
 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
@@ -958,25 +109,25 @@
 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
 
-typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
-                                            CUjitInputType Type, void *Data,
-                                            size_t Size, const char *Name,
-                                            unsigned int NumOptions,
-                                            CUjit_option *Options,
-                                            void **OptionValues);
+typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
+                                            CUjitInputType type, void *data,
+                                            size_t size, const char *name,
+                                            unsigned int numOptions,
+                                            CUjit_option *options,
+                                            void **optionValues);
 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
 
-typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
-                                           CUjit_option *Options,
-                                           void **OptionValues,
-                                           CUlinkState *StateOut);
+typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
+                                           CUjit_option *options,
+                                           void **optionValues,
+                                           CUlinkState *stateOut);
 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
 
-typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
-                                             size_t *SizeOut);
+typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
+                                             size_t *sizeOut);
 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
 
-typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
+typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
 
 typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
@@ -986,36 +137,36 @@
 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
 
-static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
+static void *getAPIHandle(void *Handle, const char *FuncName) {
   char *Err;
   void *FuncPtr;
   dlerror();
   FuncPtr = dlsym(Handle, FuncName);
   if ((Err = dlerror()) != 0) {
-    fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
+    fprintf(stdout, "Load CUDA driver API failed: %s. \n", Err);
     return 0;
   }
 
   return FuncPtr;
 }
 
-static int initialDeviceAPILibrariesCUDA() {
+static int initialDeviceAPILibraries() {
   HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
   if (!HandleCuda) {
-    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
+    printf("Cannot open library: %s. \n", dlerror());
     return 0;
   }
 
   HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
   if (!HandleCudaRT) {
-    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
+    printf("Cannot open library: %s. \n", dlerror());
    return 0;
   }
 
   return 1;
 }
 
-static int initialDeviceAPIsCUDA() {
-  if (initialDeviceAPILibrariesCUDA() == 0)
+static int initialDeviceAPIs() {
+  if (initialDeviceAPILibraries() == 0)
     return 0;
 
   /* Get function pointer to CUDA Driver APIs.
@@ -1027,76 +178,77 @@
    * as it is valid on POSIX 2008.
    */
   CuLaunchKernelFcnPtr =
-      (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
+      (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");
 
   CuMemAllocFcnPtr =
-      (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
+      (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");
 
-  CuMemFreeFcnPtr =
-      (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
+  CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2");
 
   CuMemcpyDtoHFcnPtr =
-      (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
+      (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2");
 
   CuMemcpyHtoDFcnPtr =
-      (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
+      (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2");
 
   CuModuleUnloadFcnPtr =
-      (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
+      (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload");
 
   CuCtxDestroyFcnPtr =
-      (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
+      (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy");
 
-  CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
+  CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit");
 
   CuDeviceGetCountFcnPtr =
-      (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
+      (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount");
 
   CuDeviceGetFcnPtr =
-      (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
+      (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet");
 
   CuCtxCreateFcnPtr =
-      (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
+      (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2");
 
-  CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
-      HandleCuda, "cuModuleLoadDataEx");
+  CuModuleLoadDataExFcnPtr =
+      (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
 
   CuModuleLoadDataFcnPtr =
-      (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
+      (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
 
-  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
+  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
       HandleCuda, "cuModuleGetFunction");
 
   CuDeviceComputeCapabilityFcnPtr =
-      (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
+      (CuDeviceComputeCapabilityFcnTy *)getAPIHandle(
           HandleCuda, "cuDeviceComputeCapability");
 
   CuDeviceGetNameFcnPtr =
-      (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
+      (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
 
   CuLinkAddDataFcnPtr =
-      (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
+      (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
 
   CuLinkCreateFcnPtr =
-      (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
+      (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
 
   CuLinkCompleteFcnPtr =
-      (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
+      (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
 
   CuLinkDestroyFcnPtr =
-      (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
+      (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
 
   CuCtxSynchronizeFcnPtr =
-      (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
+      (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize");
 
   /* Get function pointer to CUDA Runtime APIs. */
-  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
+  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
      HandleCudaRT, "cudaThreadSynchronize");
 
   return 1;
 }
 
-static PollyGPUContext *initContextCUDA() {
+PollyGPUContext *polly_initContext() {
+  DebugMode = getenv("POLLY_DEBUG") != 0;
+
   dump_function();
 
   PollyGPUContext *Context;
   CUdevice Device;
@@ -1111,20 +263,20 @@
     return CurrentContext;
 
   /* Get API handles. */
-  if (initialDeviceAPIsCUDA() == 0) {
-    fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
+  if (initialDeviceAPIs() == 0) {
+    fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n");
     exit(-1);
   }
 
   if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
-    fprintf(stderr, "Initializing the CUDA driver API failed.\n");
+    fprintf(stdout, "Initializing the CUDA driver API failed.\n");
     exit(-1);
   }
 
   /* Get number of devices that supports CUDA. */
   CuDeviceGetCountFcnPtr(&DeviceCount);
   if (DeviceCount == 0) {
-    fprintf(stderr, "There is no device supporting CUDA.\n");
+    fprintf(stdout, "There is no device supporting CUDA.\n");
     exit(-1);
   }
 
@@ -1138,15 +290,12 @@
   /* Create context on the device. */
   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
   if (Context == 0) {
-    fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
+    fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
     exit(-1);
   }
-  Context->Context = malloc(sizeof(CUDAContext));
-  if (Context->Context == 0) {
-    fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
-    exit(-1);
-  }
-  CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device);
+  CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device);
+
+  CacheMode = getenv("POLLY_NOCACHE") == 0;
 
   if (CacheMode)
     CurrentContext = Context;
@@ -1154,24 +303,18 @@
   return Context;
 }
 
-static void freeKernelCUDA(PollyGPUFunction *Kernel) {
-  dump_function();
-
-  if (CacheMode)
-    return;
-
-  if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
-    CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
-
-  if (Kernel->Kernel)
-    free((CUDAKernel *)Kernel->Kernel);
+static void freeKernel(PollyGPUFunction *Kernel) {
+  if (Kernel->CudaModule)
+    CuModuleUnloadFcnPtr(Kernel->CudaModule);
 
   if (Kernel)
     free(Kernel);
 }
 
-static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
-                                       const char *KernelName) {
+#define KERNEL_CACHE_SIZE 10
+
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+                                  const char *KernelName) {
   dump_function();
 
   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
@@ -1181,21 +324,16 @@
     // We exploit here the property that all Polly-ACC kernels are allocated
     // as global constants, hence a pointer comparision is sufficient to
     // determin equality.
-    if (KernelCache[i] &&
-        ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
+    if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
       debug_print("  -> using cached kernel\n");
       return KernelCache[i];
     }
   }
 
   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
+
   if (Function == 0) {
-    fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
-    exit(-1);
-  }
-  Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
-  if (Function->Kernel == 0) {
-    fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
+    fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
     exit(-1);
   }
 
@@ -1232,45 +370,43 @@
   memset(ErrorLog, 0, sizeof(ErrorLog));
 
   CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
-  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
-                            strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
+  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
+                            strlen(PTXBuffer) + 1, 0, 0, 0, 0);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
+    fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
     exit(-1);
   }
 
   Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Complete ptx linker step failed.\n");
-    fprintf(stderr, "\n%s\n", ErrorLog);
+    fprintf(stdout, "Complete ptx linker step failed.\n");
+    fprintf(stdout, "\n%s\n", ErrorLog);
    exit(-1);
   }
 
   debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
               InfoLog);
 
-  Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
-                               CuOut);
+  Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Loading ptx assembly text failed.\n");
+    fprintf(stdout, "Loading ptx assembly text failed.\n");
     exit(-1);
   }
 
-  Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
-                                  ((CUDAKernel *)Function->Kernel)->CudaModule,
+  Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
                                   KernelName);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Loading kernel function failed.\n");
+    fprintf(stdout, "Loading kernel function failed.\n");
     exit(-1);
   }
 
   CuLinkDestroyFcnPtr(LState);
 
-  ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
+  Function->PTXString = PTXBuffer;
 
   if (CacheMode) {
     if (KernelCache[NextCacheItem])
-      freeKernelCUDA(KernelCache[NextCacheItem]);
+      freeKernel(KernelCache[NextCacheItem]);
 
     KernelCache[NextCacheItem] = Function;
 
@@ -1280,37 +416,44 @@
   return Function;
 }
 
-static void synchronizeDeviceCUDA() {
+void polly_freeKernel(PollyGPUFunction *Kernel) {
   dump_function();
-  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
-    fprintf(stderr, "Synchronizing device and host memory failed.\n");
-    exit(-1);
-  }
+
+  if (CacheMode)
+    return;
+
+  freeKernel(Kernel);
 }
 
-static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
-                                     long MemSize) {
+void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
+                                long MemSize) {
   dump_function();
 
-  CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
+  CUdeviceptr CuDevData = DevData->Cuda;
   CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
 }
 
-static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
-                                     long MemSize) {
+void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
                                long MemSize) {
   dump_function();
 
-  if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
-                         MemSize) != CUDA_SUCCESS) {
-    fprintf(stderr, "Copying results from device to host memory failed.\n");
+  if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) {
+    fprintf(stdout, "Copying results from device to host memory failed.\n");
+    exit(-1);
+  }
+}
+void polly_synchronizeDevice() {
+  dump_function();
+  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
+    fprintf(stdout, "Synchronizing device and host memory failed.\n");
     exit(-1);
   }
 }
 
-static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
-                             unsigned int GridDimY, unsigned int BlockDimX,
-                             unsigned int BlockDimY, unsigned int BlockDimZ,
-                             void **Parameters) {
+void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
+                        unsigned int GridDimY, unsigned int BlockDimX,
+                        unsigned int BlockDimY, unsigned int BlockDimZ,
+                        void **Parameters) {
   dump_function();
 
   unsigned GridDimZ = 1;
@@ -1319,290 +462,45 @@
   void **Extra = 0;
   CUresult Res;
 
-  Res =
-      CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
-                           GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
-                           SharedMemBytes, Stream, Parameters, Extra);
+  Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
+                             BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
+                             Stream, Parameters, Extra);
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Launching CUDA kernel failed.\n");
+    fprintf(stdout, "Launching CUDA kernel failed.\n");
     exit(-1);
   }
 }
 
-static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
+void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
   dump_function();
 
-  CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
-  CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
-  free(DevPtr);
+  CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda);
   free(Allocation);
 }
 
-static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
+PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
  dump_function();
 
   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
+
   if (DevData == 0) {
-    fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
-    exit(-1);
-  }
-  DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
-  if (DevData->DevicePtr == 0) {
-    fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+    fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
     exit(-1);
   }
 
-  CUresult Res =
-      CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
+  CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize);
 
   if (Res != CUDA_SUCCESS) {
-    fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+    fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
     exit(-1);
   }
 
   return DevData;
 }
 
-static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
-  dump_function();
-
-  CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
-  return (void *)DevPtr->Cuda;
-}
-
-static void freeContextCUDA(PollyGPUContext *Context) {
-  dump_function();
-
-  CUDAContext *Ctx = (CUDAContext *)Context->Context;
-  if (Ctx->Cuda) {
-    CuCtxDestroyFcnPtr(Ctx->Cuda);
-    free(Ctx);
-    free(Context);
-  }
-
-  dlclose(HandleCuda);
-  dlclose(HandleCudaRT);
-}
-
-#endif /* HAS_LIBCUDART */
-/******************************************************************************/
-/*                                    API                                     */
-/******************************************************************************/
-
-PollyGPUContext *polly_initContext() {
-  DebugMode = getenv("POLLY_DEBUG") != 0;
-  CacheMode = getenv("POLLY_NOCACHE") == 0;
-
-  dump_function();
-
-  PollyGPUContext *Context;
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    Context = initContextCUDA();
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    Context = initContextCL();
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-
-  return Context;
-}
-
-void polly_freeKernel(PollyGPUFunction *Kernel) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    freeKernelCUDA(Kernel);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    freeKernelCL(Kernel);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
-                                  const char *KernelName) {
-  dump_function();
-
-  PollyGPUFunction *Function;
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    Function = getKernelCUDA(BinaryBuffer, KernelName);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    Function = getKernelCL(BinaryBuffer, KernelName);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-
-  return Function;
-}
-
-void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
-                                long MemSize) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    copyFromHostToDeviceCL(HostData, DevData, MemSize);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
-                                long MemSize) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    copyFromDeviceToHostCL(DevData, HostData, MemSize);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
-                        unsigned int GridDimY, unsigned int BlockDimX,
-                        unsigned int BlockDimY, unsigned int BlockDimZ,
-                        void **Parameters) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
-                     BlockDimZ, Parameters);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
-                   Parameters);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    freeDeviceMemoryCUDA(Allocation);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    freeDeviceMemoryCL(Allocation);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-}
-
-PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
-  dump_function();
-
-  PollyGPUDevicePtr *DevData;
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    DevData = allocateMemoryForDeviceCUDA(MemSize);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    DevData = allocateMemoryForDeviceCL(MemSize);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-
-  return DevData;
-}
-
 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
   dump_function();
 
-  void *DevPtr;
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    DevPtr = getDevicePtrCUDA(Allocation);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    DevPtr = getDevicePtrCL(Allocation);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
-
-  return DevPtr;
-}
-
-void polly_synchronizeDevice() {
-  dump_function();
-
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    synchronizeDeviceCUDA();
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    synchronizeDeviceCL();
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
-  }
+  return (void *)Allocation->Cuda;
 }
 
 void polly_freeContext(PollyGPUContext *Context) {
@@ -1611,40 +509,11 @@
   if (CacheMode)
     return;
 
-  switch (Runtime) {
-#ifdef HAS_LIBCUDART
-  case RUNTIME_CUDA:
-    freeContextCUDA(Context);
-    break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
-  case RUNTIME_CL:
-    freeContextCL(Context);
-    break;
-#endif /* HAS_LIBOPENCL */
-  default:
-    err_runtime();
+  if (Context->Cuda) {
+    CuCtxDestroyFcnPtr(Context->Cuda);
+    free(Context);
   }
-}
-
-/* Initialize GPUJIT with CUDA as runtime library. */
-PollyGPUContext *polly_initContextCUDA() {
-#ifdef HAS_LIBCUDART
-  Runtime = RUNTIME_CUDA;
-  return polly_initContext();
-#else
-  fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
-  exit(-1);
-#endif /* HAS_LIBCUDART */
-}
-/* Initialize GPUJIT with OpenCL as runtime library. */
-PollyGPUContext *polly_initContextCL() {
-#ifdef HAS_LIBOPENCL
-  Runtime = RUNTIME_CL;
-  return polly_initContext();
-#else
-  fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
-  exit(-1);
-#endif /* HAS_LIBOPENCL */
+  dlclose(HandleCuda);
+  dlclose(HandleCudaRT);
 }
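
Note (illustration, not part of the patch): the runtime keeps binding the CUDA driver API at run time through dlopen/dlsym, as getAPIHandle() and initialDeviceAPIs() above show. The following standalone sketch isolates that pattern. It is a minimal sketch only: it resolves just cuInit from libcuda.so, treats CUresult as a plain int for brevity, and performs none of the bookkeeping GPUJIT does.

#include <dlfcn.h>
#include <stdio.h>

/* Function type matching CUresult cuInit(unsigned int) by ABI; CUresult is
 * modelled as int here to avoid pulling in cuda.h for the sketch. */
typedef int CuInitFcnTy(unsigned int);

int main(void) {
  void *Handle = dlopen("libcuda.so", RTLD_LAZY);
  if (!Handle) {
    fprintf(stderr, "Cannot open library: %s.\n", dlerror());
    return 1;
  }

  dlerror(); /* Clear any stale error state before dlsym. */
  CuInitFcnTy *CuInitFcnPtr = (CuInitFcnTy *)dlsym(Handle, "cuInit");
  char *Err = dlerror();
  if (Err) {
    fprintf(stderr, "Load CUDA driver API failed: %s.\n", Err);
    return 1;
  }

  /* 0 == CUDA_SUCCESS. The object-to-function pointer cast is valid on
   * POSIX 2008, as the comment in initialDeviceAPIs() notes. */
  if (CuInitFcnPtr(0) != 0)
    fprintf(stderr, "Initializing the CUDA driver API failed.\n");

  dlclose(Handle);
  return 0;
}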
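Note (illustration, not part of the patch): the cache kept by polly_getKernel() can be read as the following self-contained model. Entry, compileToEntry, and lookupOrCompile are placeholder names invented for this sketch; only the policy mirrors the code above, namely a thread-local ring of KERNEL_CACHE_SIZE slots matched by pointer identity of the PTX string, with FIFO eviction.

#include <stdlib.h>

#define KERNEL_CACHE_SIZE 10

typedef struct Entry {
  const char *PTXString; /* Identity key: the address of the PTX global. */
  /* Compiled module/function state would live here in the real runtime. */
} Entry;

/* Stand-in for the PTX JIT-link step performed by polly_getKernel(). */
static Entry *compileToEntry(const char *PTXBuffer) {
  Entry *E = malloc(sizeof(Entry));
  if (E)
    E->PTXString = PTXBuffer;
  return E;
}

Entry *lookupOrCompile(const char *PTXBuffer) {
  static __thread Entry *Cache[KERNEL_CACHE_SIZE];
  static __thread int Next = 0;

  /* Polly emits each kernel's PTX as a global constant, so comparing the
   * pointer itself is enough to recognize a previously seen kernel. */
  for (int i = 0; i < KERNEL_CACHE_SIZE; i++)
    if (Cache[i] && Cache[i]->PTXString == PTXBuffer)
      return Cache[i];

  Entry *E = compileToEntry(PTXBuffer);

  /* Evict in FIFO order once the ring is full; the real code frees the
   * evicted kernel via freeKernel() before overwriting the slot. */
  free(Cache[Next]);
  Cache[Next] = E;
  Next = (Next + 1) % KERNEL_CACHE_SIZE;
  return E;
}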
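Note (illustration, not part of the patch): after this change the host-side surface is the CUDA-only set of polly_* entry points shown above. The sketch below strings them together in the order generated code would. Real callers are emitted by Polly's GPGPU code generation rather than written by hand; the PTX blob, kernel name, launch dimensions, and the GPUJIT.h include are assumptions made purely for illustration.

#include "GPUJIT.h"

extern const char KernelPTX[]; /* Hypothetical PTX blob, a global constant. */

void run(float *HostA, long Bytes) {
  PollyGPUContext *Ctx = polly_initContext();
  PollyGPUFunction *F = polly_getKernel(KernelPTX, "kernel_0");

  PollyGPUDevicePtr *DevA = polly_allocateMemoryForDevice(Bytes);
  polly_copyFromHostToDevice(HostA, DevA, Bytes);

  /* Kernel parameters are passed as an array of pointers to their values. */
  void *DevPtr = polly_getDevicePtr(DevA);
  void *Params[] = {&DevPtr};
  polly_launchKernel(F, /*GridDimX=*/32, /*GridDimY=*/1,
                     /*BlockDimX=*/256, /*BlockDimY=*/1, /*BlockDimZ=*/1,
                     Params);
  polly_synchronizeDevice();

  polly_copyFromDeviceToHost(DevA, HostA, Bytes);
  polly_freeDeviceMemory(DevA);
  polly_freeKernel(F);
  polly_freeContext(Ctx);
}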