Index: include/polly/CodeGen/PPCGCodeGeneration.h
===================================================================
--- include/polly/CodeGen/PPCGCodeGeneration.h
+++ include/polly/CodeGen/PPCGCodeGeneration.h
@@ -16,7 +16,7 @@
 #define POLLY_PPCGCODEGENERATION_H
 
 /// The GPU Architecture to target.
-enum GPUArch { NVPTX64, SPIR32, SPIR64 };
+enum GPUArch { NVPTX64, SPIR32, SPIR64, AMDGCN64 };
 
 /// The GPU Runtime implementation to use.
 enum GPURuntime { CUDA, OpenCL };
Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -53,6 +53,8 @@
 
 #include "llvm/Support/Debug.h"
 
+#include <fstream>
+
 using namespace polly;
 using namespace llvm;
 
@@ -118,6 +120,11 @@
                 cl::desc("The CUDA version to compile for"), cl::Hidden,
                 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::opt<std::string>
+    AMDArch("polly-acc-amd-arch",
+            cl::desc("The AMD architecture version to compile for"), cl::Hidden,
+            cl::init("gfx803"), cl::ZeroOrMore, cl::cat(PollyCategory));
+
 static cl::opt<int>
     MinCompute("polly-acc-mincompute",
                cl::desc("Minimal number of compute statements to run on GPU."),
@@ -715,10 +722,11 @@
   /// Create a call to get a kernel from an assembly string.
   ///
   /// @param Buffer The string describing the kernel.
+  /// @param Size   The size of the string describing the kernel.
   /// @param Entry  The name of the kernel function to call.
   ///
   /// @returns A pointer to a kernel object
-  Value *createCallGetKernel(Value *Buffer, Value *Entry);
+  Value *createCallGetKernel(Value *Buffer, Value *Size, Value *Entry);
 
   /// Create a call to free a GPU kernel.
   ///
@@ -735,10 +743,13 @@
   /// @param GridBlockZ The size of the third block dimension.
   /// @param Parameters A pointer to an array that contains itself pointers to
   ///                   the parameter values passed for each kernel argument.
+  /// @param CLUseLocalWorkSize A boolean dictating whether the OpenCL Runtime
+  ///                           should automatically determine the Local work
+  ///                           group size or use the provided dimensions.
   void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                               Value *GridDimY, Value *BlockDimX,
                               Value *BlockDimY, Value *BlockDimZ,
-                              Value *Parameters);
+                              Value *Parameters, Value *CLUseLocalWorkSize);
 };
 
 std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) {
@@ -861,7 +872,8 @@
     createCallFreeDeviceMemory(Array.second);
 }
 
-Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
+Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Size,
+                                           Value *Entry) {
   const char *Name = "polly_getKernel";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -871,12 +883,13 @@
     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
     std::vector<Type *> Args;
     Args.push_back(Builder.getInt8PtrTy());
+    Args.push_back(Builder.getInt64Ty());
     Args.push_back(Builder.getInt8PtrTy());
     FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
     F = Function::Create(Ty, Linkage, Name, M);
   }
 
-  return Builder.CreateCall(F, {Buffer, Entry});
+  return Builder.CreateCall(F, {Buffer, Size, Entry});
 }
 
 Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
@@ -899,7 +912,8 @@
 void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                                             Value *GridDimY, Value *BlockDimX,
                                             Value *BlockDimY, Value *BlockDimZ,
-                                            Value *Parameters) {
+                                            Value *Parameters,
+                                            Value *CLUseLocalWorkSize) {
   const char *Name = "polly_launchKernel";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -915,12 +929,13 @@
     Args.push_back(Builder.getInt32Ty());
     Args.push_back(Builder.getInt32Ty());
     Args.push_back(Builder.getInt8PtrTy());
+    Args.push_back(Builder.getInt32Ty());
     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
     F = Function::Create(Ty, Linkage, Name, M);
   }
 
   Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
-                         BlockDimZ, Parameters});
+                         BlockDimZ, Parameters, CLUseLocalWorkSize});
 }
 
 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
@@ -1342,6 +1357,9 @@
   case GPUArch::NVPTX64:
     Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
     break;
+  case GPUArch::AMDGCN64:
+    Sync = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_s_barrier);
+    break;
   }
 
   Builder.CreateCall(Sync, {});
@@ -1833,14 +1851,28 @@
 
   std::string Name = getKernelFuncName(Kernel->id);
   Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
+  Value *KernelSize = Builder.getInt64(ASMString.length() + 1);
   Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
-  Value *GPUKernel = createCallGetKernel(KernelString, NameString);
+  Value *GPUKernel = createCallGetKernel(KernelString, KernelSize, NameString);
 
   Value *GridDimX, *GridDimY;
   std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
 
+  Value *CLUseLocalWorkSize;
+
+  switch (Arch) {
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    CLUseLocalWorkSize = Builder.getInt32(1);
+    break;
+  case GPUArch::AMDGCN64:
+  case GPUArch::NVPTX64:
+    CLUseLocalWorkSize = Builder.getInt32(0);
+    break;
+  }
+
   createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
-                         BlockDimZ, Parameters);
+                         BlockDimZ, Parameters, CLUseLocalWorkSize);
   createCallFreeKernel(GPUKernel);
 
   for (auto Id : KernelIds)
@@ -1868,6 +1900,13 @@
   return Ret;
 }
 
+/// Compute the DataLayout string for the AMDGPU backend.
+static std::string computeAMDGCNDataLayout() {
+  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+}
+
 /// Compute the DataLayout string for a SPIR kernel.
 ///
 /// @param is64Bit Are we looking for a 64 bit architecture?
@@ -1969,6 +2008,9 @@
   case GPUArch::NVPTX64:
     FN->setCallingConv(CallingConv::PTX_Kernel);
     break;
+  case GPUArch::AMDGCN64:
+    FN->setCallingConv(CallingConv::AMDGPU_KERNEL);
+    break;
   case GPUArch::SPIR32:
   case GPUArch::SPIR64:
     FN->setCallingConv(CallingConv::SPIR_KERNEL);
@@ -2050,6 +2092,14 @@
     IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
     IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
     break;
+  case GPUArch::AMDGCN64:
+    IntrinsicsBID[0] = Intrinsic::amdgcn_workitem_id_x;
+    IntrinsicsBID[1] = Intrinsic::amdgcn_workitem_id_y;
+
+    IntrinsicsTID[0] = Intrinsic::amdgcn_workgroup_id_x;
+    IntrinsicsTID[1] = Intrinsic::amdgcn_workgroup_id_y;
+    IntrinsicsTID[2] = Intrinsic::amdgcn_workgroup_id_z;
+    break;
   }
 
   auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
@@ -2254,6 +2304,13 @@
       GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
     GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
     break;
+  case GPUArch::AMDGCN64:
+    if (Runtime == GPURuntime::CUDA)
+      llvm_unreachable("Cannot generate AMD code for CUDA runtime");
+    else if (Runtime == GPURuntime::OpenCL)
+      GPUModule->setTargetTriple(Triple::normalize("amdgcn-amd-amdhsa-opencl"));
+    GPUModule->setDataLayout(computeAMDGCNDataLayout());
+    break;
   case GPUArch::SPIR32:
     GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
     GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
@@ -2281,6 +2338,7 @@
   createKernelVariables(Kernel, FN);
 
   switch (Arch) {
+  case GPUArch::AMDGCN64:
   case GPUArch::NVPTX64:
     insertKernelIntrinsics(Kernel);
     break;
@@ -2305,6 +2363,16 @@
       break;
     }
     break;
+  case GPUArch::AMDGCN64:
+    switch (Runtime) {
+    case GPURuntime::CUDA:
+      llvm_unreachable("Cannot generate AMD code for CUDA runtime");
+      break;
+    case GPURuntime::OpenCL:
+      GPUTriple = llvm::Triple(Triple::normalize("amdgcn-amd-amdhsa-opencl"));
+      break;
+    }
+    break;
   case GPUArch::SPIR64:
   case GPUArch::SPIR32:
     std::string SPIRAssembly;
@@ -2331,6 +2399,9 @@
   case GPUArch::NVPTX64:
     subtarget = CudaVersion;
     break;
+  case GPUArch::AMDGCN64:
+    subtarget = AMDArch;
+    break;
   case GPUArch::SPIR32:
   case GPUArch::SPIR64:
     llvm_unreachable("No subtarget for SPIR architecture");
@@ -2345,15 +2416,49 @@
 
   PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));
 
-  if (TargetM->addPassesToEmitFile(
-          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
-    errs() << "The target does not support generation of this file type!\n";
-    return "";
+  if (Arch == GPUArch::AMDGCN64) {
+    if (TargetM->addPassesToEmitFile(
+            PM, ASMStream, TargetMachine::CGFT_ObjectFile, true /* verify */)) {
+      errs() << "The target does not support generation of this file type!\n";
+      return "";
+    }
+  } else {
+    if (TargetM->addPassesToEmitFile(PM, ASMStream,
+                                     TargetMachine::CGFT_AssemblyFile,
+                                     true /* verify */)) {
+      errs() << "The target does not support generation of this file type!\n";
+      return "";
+    }
   }
 
   PM.run(*GPUModule);
 
-  return ASMStream.str();
+  std::string Assembly = ASMStream.str();
+
+  if (Arch == GPUArch::AMDGCN64) {
+    char InFileName[] = "/tmp/polly_kernelXXXXXX";
+    char OutFileName[] = "/tmp/polly_kernelObjXXXXXX";
+    mkstemp(InFileName);
+    mkstemp(OutFileName);
+
+    std::ofstream ASMFile;
+    ASMFile.open(InFileName);
+    ASMFile << Assembly;
+    ASMFile.close();
+
+    std::string LLDCommand = "ld.lld -shared " + std::string(InFileName) +
+                             " -o " + std::string(OutFileName);
+    system(LLDCommand.c_str());
+
+    std::ifstream ifs(OutFileName);
+    Assembly.assign((std::istreambuf_iterator<char>(ifs)),
+                    (std::istreambuf_iterator<char>()));
+
+    remove(InFileName);
+    remove(OutFileName);
+  }
+
+  return Assembly;
 }
 
 bool GPUNodeBuilder::requiresCUDALibDevice() {
Index: lib/Support/RegisterPasses.cpp
===================================================================
--- lib/Support/RegisterPasses.cpp
+++ lib/Support/RegisterPasses.cpp
@@ -124,6 +124,8 @@
     GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
                   cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
                                         "target NVIDIA 64-bit architecture"),
+                             clEnumValN(GPUArch::AMDGCN64, "amdgcn64",
+                                        "target AMD GCN 64-bit architecture"),
                              clEnumValN(GPUArch::SPIR32, "spir32",
                                         "target SPIR 32-bit architecture"),
                              clEnumValN(GPUArch::SPIR64, "spir64",
Index: test/GPGPU/add-scalars-in-scop-to-kills.ll
===================================================================
--- test/GPGPU/add-scalars-in-scop-to-kills.ll
+++ test/GPGPU/add-scalars-in-scop-to-kills.ll
@@ -13,7 +13,7 @@
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 ; Check that we add variables that are local to a scop into the kills that we
 ; pass to PPCG. This should enable PPCG to codegen this example.
Index: test/GPGPU/bounds-construction-with-ignore-param-bounds.ll
===================================================================
--- test/GPGPU/bounds-construction-with-ignore-param-bounds.ll
+++ test/GPGPU/bounds-construction-with-ignore-param-bounds.ll
@@ -19,7 +19,7 @@
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 ; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll'
 
 ; C pseudocode
Index: test/GPGPU/cuda-managed-memory-simple.ll
===================================================================
--- test/GPGPU/cuda-managed-memory-simple.ll
+++ test/GPGPU/cuda-managed-memory-simple.ll
@@ -46,8 +46,8 @@
 ; CHECK-NEXT:  store i8* %15, i8** %polly_launch_0_param_1
 ; CHECK-NEXT:  %19 = bitcast i8** %polly_launch_0_param_1 to i8*
 ; CHECK-NEXT:  store i8* %19, i8** %18
-; CHECK-NEXT:  %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
-; CHECK-NEXT:  call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
+; CHECK-NEXT:  %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i64 852, i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
+; CHECK-NEXT:  call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr, i32 0)
 ; CHECK-NEXT:  call void @polly_freeKernel(i8* %20)
 ; CHECK-NEXT:  call void @polly_synchronizeDevice()
 ; CHECK-NEXT:  call void @polly_freeContext(i8* %13)
Index: test/GPGPU/intrinsic-copied-into-kernel.ll
===================================================================
--- test/GPGPU/intrinsic-copied-into-kernel.ll
+++ test/GPGPU/intrinsic-copied-into-kernel.ll
@@ -18,7 +18,7 @@
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 
 ; void f(float *A, float *B, int N) {
Index: test/GPGPU/invariant-load-array-access.ll
===================================================================
--- test/GPGPU/invariant-load-array-access.ll
+++ test/GPGPU/invariant-load-array-access.ll
@@ -22,7 +22,7 @@
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 ; This test makes sure that such an access pattern is handled correctly
 ; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads`
Index: test/GPGPU/invariant-load-hoisting-of-array.ll
===================================================================
--- test/GPGPU/invariant-load-hoisting-of-array.ll
+++ test/GPGPU/invariant-load-hoisting-of-array.ll
@@ -54,7 +54,7 @@
 
 ; Check that the kernel launch is generated in the host IR.
 ; This declaration would not have been generated unless a kernel launch exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 
 ; C pseudocode equivalent
Index: test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll
===================================================================
--- test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll
+++ test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll
@@ -22,7 +22,7 @@
 
 ; Check that the kernel launch is generated in the host IR.
 ; This declaration would not have been generated unless a kernel launch exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 ;    void f(int *begin, int *end, int *arr) {
 ;      for (int i = *begin; i < *end; i++) {
Index: test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll
===================================================================
--- test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll
+++ test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll
@@ -19,7 +19,7 @@
 
 ; Check that the kernel launch is generated in the host IR.
 ; This declaration would not have been generated unless a kernel launch exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 ; 
 ; void f(int *begin, int *arr) {
Index: test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll
===================================================================
--- test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll
+++ test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll
@@ -15,7 +15,7 @@
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 ; Check if we generate GPU code for simple loop with variable upper bound.
 ; This always worked, but have this test to prevent regressions.
Index: test/GPGPU/invariant-load-hoisting.ll
===================================================================
--- test/GPGPU/invariant-load-hoisting.ll
+++ test/GPGPU/invariant-load-hoisting.ll
@@ -17,7 +17,7 @@
 ; SCOP-NEXT:              [n, tmp12] -> { Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] };
 ; SCOP-NEXT:          Execution Context: [n, tmp12] -> {  : n > 0 }
 ; SCOP-NEXT:  }
-; HOST-IR:      call void @polly_launchKernel(i8* %209, i32 %215, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
+; HOST-IR:      call void @polly_launchKernel(i8* %209, i32 %215, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr, i32 0)
 ; HOST-IR-NEXT: call void @polly_freeKernel(i8* %209)
 
 ; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B, i8 addrspace(1)* %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge)
Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll
===================================================================
--- test/GPGPU/libdevice-functions-copied-into-kernel.ll
+++ test/GPGPU/libdevice-functions-copied-into-kernel.ll
@@ -24,7 +24,7 @@
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 
 ; void f(float *A, float *B, int N) {
Index: test/GPGPU/privatization-simple.ll
===================================================================
--- test/GPGPU/privatization-simple.ll
+++ test/GPGPU/privatization-simple.ll
@@ -9,7 +9,7 @@
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 ; void f(int A[], int B[], int control, int C[]) {
 ;     int x;
Index: test/GPGPU/privatization.ll
===================================================================
--- test/GPGPU/privatization.ll
+++ test/GPGPU/privatization.ll
@@ -10,7 +10,7 @@
 
 ; Check that kernel launch is generated in host IR.
 ; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32)
 
 ; 
 ;
Index: tools/GPURuntime/GPUJIT.h
===================================================================
--- tools/GPURuntime/GPUJIT.h
+++ tools/GPURuntime/GPUJIT.h
@@ -50,6 +50,7 @@
  *   PollyGPUDevicePtr *DevArray;
  *   int *HostData;
  *   int MemSize;
+ *   size_t KernelSize = strlen(KernelString);
  *
  *   int GridX = 8;
  *   int GridY = 8;
@@ -61,7 +62,7 @@
  *   MemSize = 256*64*sizeof(int);
  *   Context = polly_initContext();
  *   DevArray = polly_allocateMemoryForDevice(MemSize);
- *   Kernel = polly_getKernel(KernelString, KernelName);
+ *   Kernel = polly_getKernel(KernelString, KernelSize, KernelName);
  *
  *   void *Params[1];
  *   void *DevPtr = polly_getDevicePtr(DevArray)
@@ -98,6 +99,7 @@
 PollyGPUContext *polly_initContextCUDA();
 PollyGPUContext *polly_initContextCL();
 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
+                                  const size_t BinarySize,
                                   const char *KernelName);
 void polly_freeKernel(PollyGPUFunction *Kernel);
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
@@ -108,7 +110,7 @@
 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                         unsigned int GridDimY, unsigned int BlockSizeX,
                         unsigned int BlockSizeY, unsigned int BlockSizeZ,
-                        void **Parameters);
+                        void **Parameters, unsigned int CLUseLocalWorkSize);
 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation);
 void polly_freeContext(PollyGPUContext *Context);
 
Index: tools/GPURuntime/GPUJIT.c
===================================================================
--- tools/GPURuntime/GPUJIT.c
+++ tools/GPURuntime/GPUJIT.c
@@ -465,6 +465,7 @@
 }
 
 static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
+                                     const size_t BinarySize,
                                      const char *KernelName) {
   dump_function();
 
@@ -523,7 +524,6 @@
     checkOpenCLError(Ret, "Failed to create program from llvm.\n");
     unlink("kernel.ll");
   } else {
-    size_t BinarySize = strlen(BinaryBuffer);
     ((OpenCLKernel *)Function->Kernel)->Program =
         clCreateProgramWithBinaryFcnPtr(
             ((OpenCLContext *)GlobalContext->Context)->Context, 1,
@@ -591,7 +591,7 @@
 static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
                            unsigned int GridDimY, unsigned int BlockDimX,
                            unsigned int BlockDimY, unsigned int BlockDimZ,
-                           void **Parameters) {
+                           void **Parameters, unsigned int UseLocalWorkSize) {
   dump_function();
 
   cl_int Ret;
@@ -622,9 +622,14 @@
 
   static const int WorkDim = 3;
   OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
-  Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
-                                     WorkDim, NULL, GlobalWorkSize,
-                                     LocalWorkSize, 0, NULL, NULL);
+  if (UseLocalWorkSize)
+    Ret = clEnqueueNDRangeKernelFcnPtr(
+        CLContext->CommandQueue, CLKernel->Kernel, WorkDim, NULL,
+        GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
+  else
+    Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue,
+                                       CLKernel->Kernel, WorkDim, NULL,
+                                       GlobalWorkSize, NULL, 0, NULL, NULL);
   checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
 }
 
@@ -1619,6 +1624,7 @@
 }
 
 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
+                                  const size_t BinarySize,
                                   const char *KernelName) {
   dump_function();
 
@@ -1632,7 +1638,7 @@
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   case RUNTIME_CL:
-    Function = getKernelCL(BinaryBuffer, KernelName);
+    Function = getKernelCL(BinaryBuffer, BinarySize, KernelName);
     break;
 #endif /* HAS_LIBOPENCL */
   default:
@@ -1685,7 +1691,7 @@
 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                         unsigned int GridDimY, unsigned int BlockDimX,
                         unsigned int BlockDimY, unsigned int BlockDimZ,
-                        void **Parameters) {
+                        void **Parameters, unsigned int CLUseLocalWorkSize) {
   dump_function();
 
   switch (Runtime) {
@@ -1698,7 +1704,7 @@
 #ifdef HAS_LIBOPENCL
   case RUNTIME_CL:
     launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
-                   Parameters);
+                   Parameters, CLUseLocalWorkSize);
     break;
 #endif /* HAS_LIBOPENCL */
   default: