Index: CMakeLists.txt
===================================================================
--- CMakeLists.txt
+++ CMakeLists.txt
@@ -172,6 +172,11 @@
   INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
 endif(OpenCL_FOUND)
 
+option(USE_INTEL_OCL "Uses the Intel Beignet driver for GPGPU code" OFF)
+if (USE_INTEL_OCL)
+  add_definitions(-DHAS_INTEL_OCL)
+endif(USE_INTEL_OCL)
+
 option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
 if (NOT POLLY_BUNDLED_ISL)
   find_package(ISL MODULE REQUIRED)
Index: include/polly/CodeGen/PPCGCodeGeneration.h
===================================================================
--- include/polly/CodeGen/PPCGCodeGeneration.h
+++ include/polly/CodeGen/PPCGCodeGeneration.h
@@ -16,7 +16,7 @@
 #define POLLY_PPCGCODEGENERATION_H
 
 /// The GPU Architecture to target.
-enum GPUArch { NVPTX64 };
+enum GPUArch { NVPTX64, SPIR32, SPIR64 };
 
 /// The GPU Runtime implementation to use.
 enum GPURuntime { CUDA, OpenCL };
Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -37,6 +37,8 @@
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
+#include <regex>
+
 #include "isl/union_map.h"
 
 extern "C" {
@@ -570,6 +572,11 @@
   /// @returns A string containing the corresponding PTX assembly code.
   std::string createKernelASM();
 
+  /// Create a SPIR string for the current GPU kernel.
+  ///
+  /// @returns A string containing the corresponding SPIR code.
+  std::string createKernelSPIR(std::string IR);
+
   /// Remove references from the dominator tree to the kernel function @p F.
   ///
   /// @param F The function to remove references to.
@@ -1230,10 +1237,24 @@
 void GPUNodeBuilder::createKernelSync() {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  const char *SpirName = "__gen_ocl_barrier_global";
 
   Function *Sync;
 
   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    Sync = M->getFunction(SpirName);
+
+    // If Sync is not available, declare it.
+    if (!Sync) {
+      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+      std::vector<Type *> Args;
+      FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
+      Sync = Function::Create(Ty, Linkage, SpirName, M);
+      Sync->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+    break;
   case GPUArch::NVPTX64:
     Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
     break;
   }
@@ -1629,7 +1650,8 @@
   finalizeKernelArguments(Kernel);
   Function *F = Builder.GetInsertBlock()->getParent();
-  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
+  if (Arch == GPUArch::NVPTX64)
+    addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
   clearDominators(F);
   clearScalarEvolution(F);
   clearLoops(F);
 
@@ -1686,12 +1708,35 @@
   return Ret;
 }
 
+/// Compute the DataLayout string for a SPIR kernel.
+///
+/// @param is64Bit Are we looking for a 64 bit architecture?
+static std::string computeSPIRDataLayout(bool is64Bit) {
+  std::string Ret = "";
+
+  if (!is64Bit) {
+    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  } else {
+    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  }
+
+  return Ret;
+}
+
 Function *
 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {
   std::vector<Type *> Args;
   std::string Identifier = getKernelFuncName(Kernel->id);
 
+  std::vector<Metadata *> MemoryType;
+
   for (long i = 0; i < Prog->n_array; i++) {
     if (!ppcg_kernel_requires_array_argument(Kernel, i))
       continue;
 
@@ -1700,16 +1745,23 @@
       isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
       const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
       Args.push_back(SAI->getElementType());
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
     } else {
       static const int UseGlobalMemory = 1;
       Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
     }
   }
 
   int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
 
-  for (long i = 0; i < NumHostIters; i++)
+  for (long i = 0; i < NumHostIters; i++) {
     Args.push_back(Builder.getInt64Ty());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }
 
   int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
 
@@ -1718,19 +1770,49 @@
     Value *Val = IDToValue[Id];
     isl_id_free(Id);
     Args.push_back(Val->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
   }
 
-  for (auto *V : SubtreeValues)
+  for (auto *V : SubtreeValues) {
     Args.push_back(V->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }
 
   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                               GPUModule.get());
 
+  std::vector<Metadata *> EmptyStrings;
+
+  for (unsigned int i = 0; i < MemoryType.size(); i++) {
+    EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
+  }
+
+  if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
+    FN->setMetadata("kernel_arg_addr_space",
+                    MDNode::get(FN->getContext(), MemoryType));
+    FN->setMetadata("kernel_arg_name",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_access_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_base_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+  }
+
   switch (Arch) {
   case GPUArch::NVPTX64:
     FN->setCallingConv(CallingConv::PTX_Kernel);
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    FN->setCallingConv(CallingConv::SPIR_KERNEL);
+    break;
   }
 
   auto Arg = FN->arg_begin();
@@ -1796,6 +1878,8 @@
   Intrinsic::ID IntrinsicsTID[3];
 
   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
   case GPUArch::NVPTX64:
     IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
     IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
@@ -1965,6 +2049,14 @@
       GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
     GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
     break;
+  case GPUArch::SPIR32:
+    GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
+    break;
+  case GPUArch::SPIR64:
+    GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
+    break;
   }
 
   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@@ -1999,6 +2091,10 @@
       break;
     }
     break;
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    llvm_unreachable("Cannot generate ASM for SPIR architecture");
+    break;
   }
 
   std::string ErrMsg;
@@ -2018,6 +2114,10 @@
   case GPUArch::NVPTX64:
     subtarget = CudaVersion;
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    llvm_unreachable("No subtarget for SPIR architecture");
+    break;
   }
 
   std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
@@ -2040,6 +2140,43 @@
   return ASMStream.str();
 }
 
+std::string StringReplace(std::string const &in, std::string const &replace,
+                          std::string const &with) {
+  return std::regex_replace(in, std::regex(replace), with);
+}
+
+std::string GPUNodeBuilder::createKernelSPIR(std::string IR) {
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.tid.x\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_local_id0()");
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.tid.y\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_local_id1()");
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.tid.z\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_local_id2()");
+
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_group_id0()");
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_group_id1()");
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_group_id2()");
+
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.tid.x\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_local_id0()");
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.tid.y\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_local_id1()");
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.tid.z\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_local_id2()");
+
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_group_id0()");
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_group_id1()");
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_group_id2()");
+
+  return IR;
+}
+
 std::string GPUNodeBuilder::finalizeKernelFunction() {
 
   if (verifyModule(*GPUModule)) {
@@ -2056,15 +2193,27 @@
   if (DumpKernelIR)
     outs() << *GPUModule << "\n";
 
-  // Optimize module.
-  llvm::legacy::PassManager OptPasses;
-  PassManagerBuilder PassBuilder;
-  PassBuilder.OptLevel = 3;
-  PassBuilder.SizeLevel = 0;
-  PassBuilder.populateModulePassManager(OptPasses);
-  OptPasses.run(*GPUModule);
+  if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
+    // Optimize module.
+    llvm::legacy::PassManager OptPasses;
+    PassManagerBuilder PassBuilder;
+    PassBuilder.OptLevel = 3;
+    PassBuilder.SizeLevel = 0;
+    PassBuilder.populateModulePassManager(OptPasses);
+    OptPasses.run(*GPUModule);
+  }
+
+  std::string Assembly;
 
-  std::string Assembly = createKernelASM();
+  if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
+    std::string IR;
+    raw_string_ostream IROstream(IR);
+    IROstream << *GPUModule;
+    IROstream.flush();
+    Assembly = createKernelSPIR(IR);
+  } else {
+    Assembly = createKernelASM();
+  }
 
   if (DumpKernelASM)
     outs() << Assembly << "\n";
Index: lib/Support/RegisterPasses.cpp
===================================================================
--- lib/Support/RegisterPasses.cpp
+++ lib/Support/RegisterPasses.cpp
@@ -117,7 +117,11 @@
 static cl::opt<GPUArch>
     GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
                   cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
-                                        "target NVIDIA 64-bit architecture")),
+                                        "target NVIDIA 64-bit architecture"),
+                             clEnumValN(GPUArch::SPIR32, "spir32",
+                                        "target SPIR 32-bit architecture"),
+                             clEnumValN(GPUArch::SPIR64, "spir64",
+                                        "target SPIR 64-bit architecture")),
                   cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
                   cl::cat(PollyCategory));
 #endif
Index: tools/GPURuntime/GPUJIT.c
===================================================================
--- tools/GPURuntime/GPUJIT.c
+++ tools/GPURuntime/GPUJIT.c
@@ -22,14 +22,19 @@
 #ifdef __APPLE__
 #include <OpenCL/opencl.h>
 #else
+#ifdef HAS_INTEL_OCL
+#include <CL/cl_intel.h>
+#else
 #include <CL/opencl.h>
-#endif
+#endif /* HAS_INTEL_OCL */
+#endif /* __APPLE__ */
 #endif /* HAS_LIBOPENCL */
 
 #include <dlfcn.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 
 static int DebugMode;
 static int CacheMode;
@@ -139,6 +144,12 @@
     const cl_event *EventWaitList, cl_event *Event);
 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
 
+typedef cl_program
+clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
+                                  const cl_device_id *DeviceList,
+                                  const char *Filename, cl_int *ErrcodeRet);
+static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
+
 typedef cl_program clCreateProgramWithBinaryFcnTy(
     cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
     const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
@@ -210,7 +221,11 @@
 }
 
 static int initialDeviceAPILibrariesCL() {
+#ifdef HAS_INTEL_OCL
+  HandleOpenCL = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
+#else
   HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
+#endif /* HAS_INTEL_OCL */
   if (!HandleOpenCL) {
     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
     return 0;
   }
@@ -261,6 +276,10 @@
   clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
       HandleOpenCL, "clEnqueueWriteBuffer");
 
+  clCreateProgramWithLLVMIntelFcnPtr =
+      (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
+          HandleOpenCL, "clCreateProgramWithLLVMIntel");
+
   clCreateProgramWithBinaryFcnPtr =
       (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
           HandleOpenCL, "clCreateProgramWithBinary");
@@ -481,12 +500,28 @@
   }
 
   cl_int Ret;
+
+#ifdef HAS_INTEL_OCL
+  FILE *fp = fopen("kernel.ll", "wb");
+  if (fp != NULL) {
+    fputs(BinaryBuffer, fp);
+    fclose(fp);
+  }
+
+  ((OpenCLKernel *)Function->Kernel)->Program =
+      clCreateProgramWithLLVMIntelFcnPtr(
+          ((OpenCLContext *)GlobalContext->Context)->Context, 1,
+          &GlobalDeviceID, "kernel.ll", &Ret);
+  checkOpenCLError(Ret, "Failed to create program from llvm.\n");
+  unlink("kernel.ll");
+#else
   size_t BinarySize = strlen(BinaryBuffer);
   ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr(
       ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID,
       (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL,
       &Ret);
   checkOpenCLError(Ret, "Failed to create program from binary.\n");
+#endif /* HAS_INTEL_OCL */
 
   Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
                              &GlobalDeviceID, NULL, NULL, NULL);
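
Note on createKernelSPIR() above: the snippet below is a minimal, standalone C++ sketch of the same std::regex-based rewrite, applied to a small sample IR fragment. The sample string and the main() driver are illustrative assumptions; only the patterns and replacements are taken from the patch.

#include <iostream>
#include <regex>
#include <string>

// Same helper shape as in the patch: replace every match of a regex pattern.
static std::string StringReplace(std::string const &In,
                                 std::string const &Pattern,
                                 std::string const &With) {
  return std::regex_replace(In, std::regex(Pattern), With);
}

int main() {
  // Illustrative kernel IR fragment, as the NVPTX-flavored path prints it.
  std::string IR = "  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n"
                   "declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n";

  // Map the NVVM index intrinsic to Beignet's SPIR builtin, declaration first,
  // then the call sites, mirroring the order used in createKernelSPIR().
  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.tid.x\\(\\)",
                     "declare spir_func i32 @__gen_ocl_get_local_id0()");
  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.tid.x\\(\\)",
                     "call spir_func i32 @__gen_ocl_get_local_id0()");

  std::cout << IR;
  return 0;
}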