Index: openmp/libomptarget/include/omptarget.h =================================================================== --- openmp/libomptarget/include/omptarget.h +++ openmp/libomptarget/include/omptarget.h @@ -434,7 +434,7 @@ int __tgt_print_device_info(int64_t DeviceId); -int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, +int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput); #ifdef __cplusplus Index: openmp/libomptarget/include/rtl.h =================================================================== --- openmp/libomptarget/include/rtl.h +++ openmp/libomptarget/include/rtl.h @@ -72,7 +72,7 @@ typedef int32_t(data_unlock_ty)(int32_t, void *); typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t); typedef int32_t(data_notify_unmapped_ty)(int32_t, void *); - typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool); + typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool, bool); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2484,6 +2484,16 @@ } Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); } + Error getDeviceMemorySize(uint64_t &Value) override { + for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { + if (Pool->isGlobal()){ + hsa_status_t Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value); + return Plugin::check(Status, "Error in getDeviceMemorySize %s"); + } + } + return Plugin::error("getDeviceMemorySize:: no global pool"); + } + /// AMDGPU-specific function to get device attributes. 
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) { hsa_status_t Status = Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -641,6 +641,16 @@ Error queryAsync(__tgt_async_info *AsyncInfo); virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0; + /// Check whether the architecture supports managing the virtual address space. + virtual bool supportVAManagement() const { return false; } + + /// De-allocates device memory and Unmaps the Virtual Addr + virtual Error memoryVAUnMap(void *VAddr, size_t Size); + + /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to map it to \p VAddr. + /// The obtained address is stored in \p Addr. At return \p RSize contains the actual size. + virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize); + /// Allocate data on the device or involving the device. Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind); @@ -762,6 +772,8 @@ uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } + virtual Error getDeviceMemorySize(uint64_t &DSize); + /// Get target compute unit kind (e.g., sm_80, or gfx908). 
virtual std::string getComputeUnitKind() const { return "unknown"; } Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -48,41 +48,88 @@ void *MemoryStart; void *MemoryPtr; size_t MemorySize; + size_t TotalSize; GenericDeviceTy *Device; std::mutex AllocationLock; RRStatusTy Status; bool ReplaySaveOutput; - uint64_t DeviceMemorySize; - - // Record/replay pre-allocates the largest possible device memory using the - // default kind. - // TODO: Expand allocation to include other kinds (device, host, shared) and - // possibly use a MemoryManager to track (de-)allocations for - // storing/retrieving when recording/replaying. - Error preallocateDeviceMemory(uint64_t DeviceMemorySize) { - // Pre-allocate memory on device. Starts with 64GB and subtracts in steps - // of 1GB until allocation succeeds. 
- const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize; + + void *suggestAddress(uint64_t MaxMemoryAllocation){ + // Get a valid pointer address for this system + void *Addr = Device->allocate(1024, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT); + Device->free(Addr); + // Align Address to MaxMemoryAllocation + Addr = (void *) (((uintptr_t) Addr + (MaxMemoryAllocation - 1)) & (~(MaxMemoryAllocation - 1))); + // Pad Address by MaxMemoryAllocation to guarantee enough space + Addr = (void *) ((uintptr_t) Addr - MaxMemoryAllocation); + return Addr; + } + + Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr){ + size_t ASize = MaxMemoryAllocation; + + if ( !VAddr && isRecording() ){ + VAddr = suggestAddress(MaxMemoryAllocation); + } + + DP("Request %" PRIu64 " bytes allocated at %p\n", MaxMemoryAllocation, VAddr); + + if ( auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize) ) + return Err; + + if ( isReplaying() && VAddr != MemoryStart ){ + return Plugin::error("Record-Replay cannot assign the "
 "requested recorded address (%p, %p)", VAddr, MemoryStart); + } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), + "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart); + + MemoryPtr = MemoryStart; + MemorySize = 0; + TotalSize = ASize; + return Plugin::success(); + } + + Error preAllocateHeurustic(uint64_t MaxMemoryAllocation, void *VAddr){ + const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation; constexpr size_t STEP = 1024 * 1024 * 1024ULL; MemoryStart = nullptr; - for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) { + for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) { MemoryStart = - Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT); + Device->allocate(TotalSize, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT); if (MemoryStart) break; } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), + "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize, MemoryStart); + if 
(!MemoryStart) return Plugin::error("Allocating record/replay memory"); + if ( VAddr && VAddr != MemoryStart ) + return Plugin::error("Cannot allocate recorded address"); + MemoryPtr = MemoryStart; MemorySize = 0; return Plugin::success(); } - void dumpDeviceMemory(StringRef Filename) { + Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr){ + if ( Device->supportVAManagement() ) + return preAllocateVAMemory(DeviceMemorySize, ReqVAddr); + + uint64_t DevMemSize; + if ( auto Err = Device->getDeviceMemorySize(DevMemSize) ) + return Err; + + return preAllocateHeurustic(DevMemSize, ReqVAddr); + } + + void dumpDeviceMemory(StringRef Filename){ ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB = WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize); if (!DeviceMemoryMB)
DeviceImageTy &Image, void **ArgPtrs, ptrdiff_t *ArgOffsets, int32_t NumArgs, uint64_t NumTeamsClause, - uint32_t ThreadLimitClause, uint64_t LoopTripCount) { + uint32_t ThreadLimitClause, uint64_t LoopTripCount){ json::Object JsonKernelInfo; JsonKernelInfo["Name"] = Name; JsonKernelInfo["NumArgs"] = NumArgs; @@ -196,6 +244,7 @@ JsonKernelInfo["LoopTripCount"] = LoopTripCount; JsonKernelInfo["DeviceMemorySize"] = MemorySize; JsonKernelInfo["DeviceId"] = Device->getDeviceId(); + JsonKernelInfo["BumpAllocVAStart"] = (intptr_t) MemoryStart; json::Array JsonArgPtrs; for (int I = 0; I < NumArgs; ++I) @@ -239,31 +288,39 @@ Alloc = MemoryPtr; MemoryPtr = (char *)MemoryPtr + AlignedSize; MemorySize += AlignedSize; - DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc)); + DP("Memory Allocator return %p \n", Alloc); return Alloc; } - Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status, + Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr, RRStatusTy Status, bool SaveOutput) { this->Device = Device; this->Status = Status; - this->DeviceMemorySize = MemSize; this->ReplaySaveOutput = SaveOutput; - if (auto Err = preallocateDeviceMemory(MemSize)) + if (auto Err = preallocateDeviceMemory(MemSize, VAddr)) return Err; INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), "Record Replay Initialized (%p)" " as starting address, %lu Memory Size" " and set on status %s\n", - MemoryStart, MemSize, + MemoryStart, TotalSize, Status == RRStatusTy::RRRecording ? 
"Recording" : "Replaying"); return Plugin::success(); } - void deinit() { Device->free(MemoryStart); } + void deinit() { + if ( Device->supportVAManagement() ){ + if ( auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize) ){ + report_fatal_error("Error on releasing virtual memory space"); + } + } + else { + Device->free(MemoryStart); + } + } } RecordReplay; @@ -1056,6 +1113,19 @@ return queryAsyncImpl(*AsyncInfo); } + +Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize){ + return Plugin::error("Device does not support VA Management"); +} + +Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size){ + return Plugin::error("Device does not support VA Management"); +} + +Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) { + return Plugin::error("Missing getDeviceMemorySize implementation (required by RR-heuristic)"); +} + Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind) { void *Alloc = nullptr; @@ -1163,8 +1233,7 @@ if (RecordReplay.isRecording()) RecordReplay.saveImage(GenericKernel.getName(), GenericKernel.getImage()); - auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs, - AsyncInfoWrapper); + auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs, AsyncInfoWrapper); // 'finalize' here to guarantee next record-replay actions are in-sync AsyncInfoWrapper.finalize(Err); @@ -1425,7 +1494,9 @@ } int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, - uint64_t MemorySize, bool isRecord, + uint64_t MemorySize, + void *VAddr, + bool isRecord, bool SaveOutput) { GenericPluginTy &Plugin = Plugin::get(); GenericDeviceTy &Device = Plugin.getDevice(DeviceId); @@ -1433,7 +1504,7 @@ isRecord ? 
RecordReplayTy::RRStatusTy::RRRecording : RecordReplayTy::RRStatusTy::RRReplaying; - if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) { + if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) { REPORT("WARNING RR did not intialize RR-properly with %lu bytes" "(Error: %s)\n", MemorySize, toString(std::move(Err)).data()); Index: openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h =================================================================== --- openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -24,8 +24,95 @@ typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; +typedef unsigned long long CUmemGenericAllocationHandle_v1; +typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; + #define CU_DEVICE_INVALID ((CUdevice)-2) +typedef enum CUmemAllocationGranularity_flags_enum { + CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ + CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ +} CUmemAllocationGranularity_flags; + +typedef enum CUmemAccess_flags_enum { + CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ + CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF +} CUmemAccess_flags; + +typedef enum CUmemLocationType_enum { + CU_MEM_LOCATION_TYPE_INVALID = 0x0, + CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ + CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemLocationType; + +typedef struct CUmemLocation_st { + CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. 
*/ + int id; /**< identifier for a given this location's ::CUmemLocationType. */ +} CUmemLocation_v1; +typedef CUmemLocation_v1 CUmemLocation; + +typedef struct CUmemAccessDesc_st { + CUmemLocation location; /**< Location on which the request is to change it's accessibility */ + CUmemAccess_flags flags; /**< ::CUmemProt accessibility flags to set on the request */ +} CUmemAccessDesc_v1; + +typedef CUmemAccessDesc_v1 CUmemAccessDesc; + +typedef enum CUmemAllocationType_enum { + CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, + + /** This allocation type is 'pinned', i.e. cannot migrate from its current + * location while the application is actively using it + */ + CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, + CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationType; + +typedef enum CUmemAllocationHandleType_enum { + CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. > */ + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ + CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ + CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ + CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationHandleType; + +typedef struct CUmemAllocationProp_st { + /** Allocation type */ + CUmemAllocationType type; + /** requested ::CUmemAllocationHandleType */ + CUmemAllocationHandleType requestedHandleTypes; + /** Location of allocation */ + CUmemLocation location; + /** + * Windows-specific POBJECT_ATTRIBUTES required when + * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object atributes structure + * includes security attributes that define + * the scope of which exported allocations may be tranferred to other + * processes. In all other cases, this field is required to be zero. 
+ */ + void *win32HandleMetaData; + struct { + /** + * Allocation hint for requesting compressible memory. + * On devices that support Compute Data Compression, compressible + * memory can be used to accelerate accesses to data with unstructured + * sparsity and other compressible data patterns. Applications are + * expected to query allocation property of the handle obtained with + * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to + * validate if the obtained allocation is compressible or not. Note that + * compressed memory may not be mappable on all devices. + */ + unsigned char compressionType; + unsigned char gpuDirectRDMACapable; + /** Bitmask indicating intended usage for this allocation */ + unsigned short usage; + unsigned char reserved[4]; + } allocFlags; +} CUmemAllocationProp_v1; +typedef CUmemAllocationProp_v1 CUmemAllocationProp; + typedef enum cudaError_enum { CUDA_SUCCESS = 0, CUDA_ERROR_INVALID_VALUE = 1, @@ -268,4 +355,14 @@ CUresult cuEventSynchronize(CUevent); CUresult cuEventDestroy(CUevent); +CUresult cuMemUnmap(CUdeviceptr ptr, size_t size); +CUresult cuMemRelease(CUmemGenericAllocationHandle handle); +CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size); +CUresult cuMemGetInfo(size_t *free, size_t *total); +CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); +CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); +CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); +CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); +CUresult cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); + #endif Index: openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp 
=================================================================== --- openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -79,6 +79,16 @@ DLWRAP(cuEventSynchronize, 1) DLWRAP(cuEventDestroy, 1) +DLWRAP(cuMemUnmap, 2); +DLWRAP(cuMemRelease, 1); +DLWRAP(cuMemAddressFree, 2); +DLWRAP(cuMemGetInfo, 2); +DLWRAP(cuMemAddressReserve, 5); +DLWRAP(cuMemMap, 5); +DLWRAP(cuMemCreate, 4); +DLWRAP(cuMemSetAccess, 4); +DLWRAP(cuMemGetAllocationGranularity, 3); + DLWRAP_FINALIZE() #ifndef DYNAMIC_CUDA_PATH Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -517,6 +517,109 @@ return Plugin::check(Res, "Error in cuStreamQuery: %s"); } + /// CUDA support VA management + bool supportVAManagement() const override { return true; } + + /// Allocates \p RSize bytes (rounded up to page size) and hints the cuda driver to map it to \p VAddr. + /// The obtained address is stored in \p Addr. 
At return \p RSize contains the actual size + Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override { + CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr); + auto IHandle = DeviceMMaps.find(DVAddr); + size_t Size = *RSize; + + if ( Size == 0 ) + return Plugin::error("Memory Map Size must be larger than 0"); + + // Check if we have already mapped this address + if ( IHandle != DeviceMMaps.end() ) + return Plugin::error("Address already memory mapped"); + + CUmemAllocationProp Prop = {}; + size_t Granularity = 0; + + size_t Free, Total; + CUresult Res = cuMemGetInfo(&Free, &Total); + if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s")) + return std::move(Err); + + if ( Size >= Free ){ + *Addr = nullptr; + return Plugin::error("Cannot map memory size larger than the available device memory"); + } + + // currently NVidia only supports pinned device types + Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + + Prop.location.id = DeviceId; + Res = cuMemGetAllocationGranularity(&Granularity, &Prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM); + if (auto Err = Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s")) + return std::move(Err); + + if ( Granularity == 0 ) + return Plugin::error("Wrong device Page size"); + + // Ceil to page size. 
+ Size = (Size+Granularity-1)/Granularity * Granularity; + + // Create a handler of our allocation + CUmemGenericAllocationHandle AHandle; + Res = cuMemCreate(&AHandle, Size, &Prop, 0); + if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s")) + return std::move(Err); + + CUdeviceptr DevPtr = 0; + Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0); + if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s")) { + cuMemRelease(AHandle); + return std::move(Err); + } + + Res = cuMemMap(DevPtr, Size, 0, AHandle, 0); + if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s")) { + cuMemRelease(AHandle); + cuMemAddressFree(DevPtr, Size); + return std::move(Err); + } + + CUmemAccessDesc ADesc = {}; + ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + ADesc.location.id = DeviceId; + ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + // Sets address + Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1); + if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s")) { + cuMemUnmap(DevPtr, Size); + cuMemRelease(AHandle); + cuMemAddressFree(DevPtr, Size); + return std::move(Err); + } + + *Addr = reinterpret_cast<void *>(DevPtr); + *RSize = Size; + DeviceMMaps.insert( { DevPtr, AHandle } ); + return Plugin::success(); + } + + /// De-allocates device memory and Unmaps the Virtual Addr + Error memoryVAUnMap(void *VAddr, size_t Size) override { + CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr); + auto IHandle = DeviceMMaps.find(DVAddr); + // Mapping does not exist + if ( IHandle == DeviceMMaps.end() ){ + return Plugin::error("Addr is not MemoryMapped"); + } + CUmemGenericAllocationHandle& AllocHandle = IHandle->second; + + CUresult Res = cuMemUnmap(DVAddr, Size); + if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s")) + return std::move(Err); + + Res = cuMemRelease(AllocHandle); + if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s")) + return std::move(Err); + + Res = cuMemAddressFree(DVAddr, Size); + if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s")) + return std::move(Err); + + DeviceMMaps.erase(IHandle); + return Plugin::success(); + } + Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override { // TODO: 
Register the buffer as CUDA host memory. return HstPtr; @@ -835,6 +938,11 @@ return Plugin::check(Res, "Error in cuCtxGetLimit: %s"); } + Error getDeviceMemorySize(uint64_t &Value) override { + CUresult Res = cuDeviceTotalMem(&Value, Device); + return Plugin::check(Res , "Error in getDeviceMemorySize %s"); + } + /// CUDA-specific function to get device attributes. Error getDeviceAttr(uint32_t Kind, uint32_t &Value) { // TODO: Warn if the new value is larger than the old. @@ -872,6 +980,9 @@ /// The CUDA device handler. CUdevice Device = CU_DEVICE_INVALID; + /// The memory memory mapped addresses and their handlers + std::unordered_map DeviceMMaps; + /// The compute capability of the corresponding CUDA device. struct ComputeCapabilityTy { uint32_t Major; Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -546,6 +546,7 @@ RTL->activate_record_replay(RTLDeviceID, OMPX_DeviceMemorySize * 1024 * 1024 * 1024, + nullptr, true, OMPX_ReplaySaveOutput); } Index: openmp/libomptarget/src/interface.cpp =================================================================== --- openmp/libomptarget/src/interface.cpp +++ openmp/libomptarget/src/interface.cpp @@ -338,11 +338,13 @@ /// \param DeviceId The device identifier to execute the target region. /// \param MemorySize The number of bytes to be (pre-)allocated /// by the bump allocator +/// \param VAddr The Virtual Address to assing to the initial +/// address of the BumpAllocator /// /param IsRecord Activates the record replay mechanism in /// 'record' mode or 'replay' mode. 
/// /param SaveOutput Store the device memory after kernel /// execution on persistent storage -EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, +EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput) { if (!deviceIsReady(DeviceId)) { DP("Device %" PRId64 " is not ready\n", DeviceId); @@ -350,7 +352,7 @@ } DeviceTy &Device = *PM->Devices[DeviceId]; - int Rc = target_activate_rr(Device, MemorySize, IsRecord, SaveOutput); + int Rc = target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput); assert(Rc == OFFLOAD_SUCCESS && "__tgt_activate_record_replay unexpected failure!"); return OMP_TGT_SUCCESS; Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -1715,9 +1715,9 @@ /// Enables the record replay mechanism by pre-allocating MemorySize /// and informing the record-replayer of whether to store the output /// in some file. 
-int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, bool isRecord, +int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr, bool isRecord, bool SaveOutput) { - return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, + return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr, isRecord, SaveOutput); } Index: openmp/libomptarget/src/private.h =================================================================== --- openmp/libomptarget/src/private.h +++ openmp/libomptarget/src/private.h @@ -41,7 +41,7 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo); -extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, +extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *reqAddr, bool isRecord, bool SaveOutput); extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, Index: openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp =================================================================== --- openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp +++ openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp @@ -88,6 +88,8 @@ TgtArgOffsets.push_back( reinterpret_cast(It.getAsInteger().value())); + void *BAllocStart = reinterpret_cast(JsonKernelInfo->getAsObject()->getInteger("BumpAllocVAStart").value()); + __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0}; std::string KernelEntryName = KernelFunc.value().str(); KernelEntry.name = const_cast(KernelEntryName.c_str()); @@ -126,7 +128,7 @@ __tgt_register_lib(&Desc); - int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, false, + int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart, false, VerifyOpt); if (Rc != OMP_TGT_SUCCESS) {