Index: tools/GPURuntime/GPUJIT.h
===================================================================
--- tools/GPURuntime/GPUJIT.h
+++ tools/GPURuntime/GPUJIT.h
@@ -13,6 +13,7 @@
 
 #ifndef GPUJIT_H_
 #define GPUJIT_H_
+#include "stddef.h"
 
 /*
  * The following demostrates how we can use the GPURuntime library to
@@ -110,4 +111,13 @@
                         void **Parameters);
 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation);
 void polly_freeContext(PollyGPUContext *Context);
+    
+// Note that polly_{malloc/free}Managed are currently not used by Polly.
+// We use them in COSMO by replacing all malloc with mallocManaged and all
+// frees with cudaFree, so we can get managed memory "automatically".
+// Needless to say, this is a hack.
+// Please make sure that this code is not present in Polly when 2018 rolls in.
+// If this is still present, ping Siddharth Bhat <siddu.druid@gmail.com>
+void *polly_mallocManaged(size_t size);
+void polly_freeManaged(void *mem);
 #endif /* GPUJIT_H_ */
Index: tools/GPURuntime/GPUJIT.c
===================================================================
--- tools/GPURuntime/GPUJIT.c
+++ tools/GPURuntime/GPUJIT.c
@@ -32,8 +32,8 @@
 #include <string.h>
 #include <unistd.h>
 
-static int DebugMode;
-static int CacheMode;
+static int DebugMode = 0;
+static int CacheMode = 1;
 
 static PollyGPURuntime Runtime = RUNTIME_NONE;
 
@@ -1188,7 +1188,29 @@
     fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
     exit(-1);
   }
-  CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device);
+
+  // In cases where managed memory is used, it is quite likely that
+  // `cudaMallocManaged` / `polly_mallocManaged` was called before
+  // `polly_initContext` was called.
+  //
+  // If `polly_initContext` calls `CuCtxCreate` when there already was a
+  // pre-existing context created by the runtime API, this causes code running
+  // on P100 to hang. So, we query for a pre-existing context to try and use.
+  // If there is no pre-existing context, we create a new context
+
+  // The possible pre-existing context from previous runtime API calls.
+  CUcontext MaybeRuntimeAPIContext;
+  if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
+    fprintf(stderr, "cuCtxGetCurrent failed");
+    exit(-1);
+  }
+
+  // There was no previous context, initialise it.
+  if (MaybeRuntimeAPIContext == NULL) {
+    CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device);
+  } else {
+    ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
+  }
 
   if (CacheMode)
     CurrentContext = Context;
@@ -1371,6 +1393,56 @@
   }
 }
 
+// Maximum number of managed memory pointers.
+#define MAX_POINTERS 4000
+void *g_managedptrs[MAX_POINTERS];
+int g_nmanagedptrs = 0;
+
+// Add a pointer as being allocated by cuMallocManaged
+void addManagedPtr(void *mem) {
+  assert(g_nmanagedptrs < MAX_POINTERS);
+  g_managedptrs[g_nmanagedptrs++] = mem;
+}
+
+int isManagedPtr(void *mem) {
+  for (int i = 0; i < g_nmanagedptrs; i++) {
+    if (g_managedptrs[i] == mem)
+      return 1;
+  }
+  return 0;
+}
+
+void polly_freeManaged(void *mem) {
+  dump_function();
+
+  // Something extremely weird goes on in the COSMO code: we get calls
+  // to free memory that have not been `malloc`d. Since we replace all free
+  // with `cudaFreee`, sometimes we get calls to free to memory we have not
+  // even mallocd.
+  // So, keep a free list and check if this is one of "our" pointers. If it is,
+  // call `cudaFree`. If not, pass it along to the underlying allocator.
+  if (isManagedPtr(mem)) {
+    if (cudaFree(mem) != cudaSuccess) {
+      fprintf(stderr, "cudaFree failed.\n");
+      exit(-1);
+    }
+    return;
+  } else {
+    free(mem);
+  }
+}
+
+void *polly_mallocManaged(size_t size) {
+  dump_function();
+  void *a;
+  if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) {
+    fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
+    exit(-1);
+  }
+  addManagedPtr(a);
+  return a;
+}
+
 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
   dump_function();
   CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;