Index: openmp/libomptarget/include/omptarget.h =================================================================== --- openmp/libomptarget/include/omptarget.h +++ openmp/libomptarget/include/omptarget.h @@ -434,7 +434,7 @@ int __tgt_print_device_info(int64_t DeviceId); -int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, +int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput); #ifdef __cplusplus Index: openmp/libomptarget/include/rtl.h =================================================================== --- openmp/libomptarget/include/rtl.h +++ openmp/libomptarget/include/rtl.h @@ -72,7 +72,7 @@ typedef int32_t(data_unlock_ty)(int32_t, void *); typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t); typedef int32_t(data_notify_unmapped_ty)(int32_t, void *); - typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool); + typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool, bool); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2484,6 +2484,16 @@ } Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); } + Error getDeviceMemorySize(uint64_t &Value) override { + for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { + if (Pool->isGlobal()){ + hsa_status_t Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value); + return Plugin::check(Status, "Error in getDeviceMemorySize %s"); + } + } + return Plugin::error("getDeviceMemorySize:: no global pool"); + } + /// AMDGPU-specific function to get device attributes. 
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) { hsa_status_t Status = Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -641,6 +641,16 @@ Error queryAsync(__tgt_async_info *AsyncInfo); virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0; + /// Check whether the architecture supports managing the virtual address space. + virtual bool supportVAManagement() const { return false; } + + /// De-allocates device memory and Unmaps the Virtual Addr + virtual Error memoryVAUnMap(void *VAddr, size_t Size); + + /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to map it to \p VAddr. + /// The obtained address is stored in \p Addr. At return \p RSize contains the actual size. + virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize); + /// Allocate data on the device or involving the device. Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind); @@ -762,6 +772,8 @@ uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } + virtual Error getDeviceMemorySize(uint64_t &DSize); + /// Get target compute unit kind (e.g., sm_80, or gfx908). 
virtual std::string getComputeUnitKind() const { return "unknown"; } Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -48,41 +48,88 @@ void *MemoryStart; void *MemoryPtr; size_t MemorySize; + size_t TotalSize; GenericDeviceTy *Device; std::mutex AllocationLock; RRStatusTy Status; bool ReplaySaveOutput; - uint64_t DeviceMemorySize; - - // Record/replay pre-allocates the largest possible device memory using the - // default kind. - // TODO: Expand allocation to include other kinds (device, host, shared) and - // possibly use a MemoryManager to track (de-)allocations for - // storing/retrieving when recording/replaying. - Error preallocateDeviceMemory(uint64_t DeviceMemorySize) { - // Pre-allocate memory on device. Starts with 64GB and subtracts in steps - // of 1GB until allocation succeeds. 
- const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize; + + void *suggestAddress(uint64_t MaxMemoryAllocation){ + // Get a valid pointer address for this system + void *Addr = Device->allocate(1024, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT); + Device->free(Addr); + // Align Address to MaxMemoryAllocation + Addr = (void *) (((uintptr_t) Addr + (MaxMemoryAllocation - 1)) & (~(MaxMemoryAllocation - 1))); + // Pad Address by MaxMemoryAllocation to guarantee enough space + Addr = (void *) ((uintptr_t) Addr - MaxMemoryAllocation); + return Addr; + } + + Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr){ + size_t ASize = MaxMemoryAllocation; + + if ( !VAddr && isRecording() ){ + VAddr = suggestAddress(MaxMemoryAllocation); + } + + DP("Request %" PRIu64 " bytes allocated at %p\n", MaxMemoryAllocation, VAddr); + + if ( auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize) ) + return Err; + + if ( isReplaying() && VAddr != MemoryStart ){ + return Plugin::error("Record-Replay cannot assign the "
 "requested recorded address (%p, %p)", VAddr, MemoryStart); + } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), + "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart); + + MemoryPtr = MemoryStart; + MemorySize = 0; + TotalSize = ASize; + return Plugin::success(); + } + + Error preAllocateHeurustic(uint64_t MaxMemoryAllocation, void *VAddr){ + const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation; constexpr size_t STEP = 1024 * 1024 * 1024ULL; MemoryStart = nullptr; - for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) { + for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) { MemoryStart = - Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT); + Device->allocate(TotalSize, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT); if (MemoryStart) break; } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), + "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize, MemoryStart); + if 
(!MemoryStart) return Plugin::error("Allocating record/replay memory"); + if ( VAddr && VAddr != MemoryStart ) + return Plugin::error("Cannot allocate recorded address"); + MemoryPtr = MemoryStart; MemorySize = 0; return Plugin::success(); } - void dumpDeviceMemory(StringRef Filename) { + Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr){ + if ( Device->supportVAManagement() ) + return preAllocateVAMemory(DeviceMemorySize, ReqVAddr); + + uint64_t DevMemSize; + if ( auto Err = Device->getDeviceMemorySize(DevMemSize) ) + return Err; + + return preAllocateHeurustic(DevMemSize, ReqVAddr); + } + + void dumpDeviceMemory(StringRef Filename){ ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB = WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize); if (!DeviceMemoryMB)
DeviceImageTy &Image, void **ArgPtrs, ptrdiff_t *ArgOffsets, int32_t NumArgs, uint64_t NumTeamsClause, - uint32_t ThreadLimitClause, uint64_t LoopTripCount) { + uint32_t ThreadLimitClause, uint64_t LoopTripCount){ json::Object JsonKernelInfo; JsonKernelInfo["Name"] = Name; JsonKernelInfo["NumArgs"] = NumArgs; @@ -196,6 +244,7 @@ JsonKernelInfo["LoopTripCount"] = LoopTripCount; JsonKernelInfo["DeviceMemorySize"] = MemorySize; JsonKernelInfo["DeviceId"] = Device->getDeviceId(); + JsonKernelInfo["BumpAllocVAStart"] = (intptr_t) MemoryStart; json::Array JsonArgPtrs; for (int I = 0; I < NumArgs; ++I) @@ -239,31 +288,39 @@ Alloc = MemoryPtr; MemoryPtr = (char *)MemoryPtr + AlignedSize; MemorySize += AlignedSize; - DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc)); + DP("Memory Allocator return %p \n", Alloc); return Alloc; } - Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status, + Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr, RRStatusTy Status, bool SaveOutput) { this->Device = Device; this->Status = Status; - this->DeviceMemorySize = MemSize; this->ReplaySaveOutput = SaveOutput; - if (auto Err = preallocateDeviceMemory(MemSize)) + if (auto Err = preallocateDeviceMemory(MemSize, VAddr)) return Err; INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), "Record Replay Initialized (%p)" " as starting address, %lu Memory Size" " and set on status %s\n", - MemoryStart, MemSize, + MemoryStart, TotalSize, Status == RRStatusTy::RRRecording ? 
"Recording" : "Replaying"); return Plugin::success(); } - void deinit() { Device->free(MemoryStart); } + void deinit() { + if ( Device->supportVAManagement() ){ + if ( auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize) ){ + report_fatal_error("Error on releasing virtual memory space"); + } + } + else { + Device->free(MemoryStart); + } + } } RecordReplay; @@ -1056,6 +1113,19 @@ return queryAsyncImpl(*AsyncInfo); } + +Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize){ + return Plugin::error("Device does not support VA Management"); +} + +Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size){ + return Plugin::error("Device does not support VA Management"); +} + +Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) { + return Plugin::error("Missing getDeviceMemorySize implementation (required by RR-heuristic)"); +} + Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind) { void *Alloc = nullptr; @@ -1163,8 +1233,7 @@ if (RecordReplay.isRecording()) RecordReplay.saveImage(GenericKernel.getName(), GenericKernel.getImage()); - auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs, - AsyncInfoWrapper); + auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs, AsyncInfoWrapper); // 'finalize' here to guarantee next record-replay actions are in-sync AsyncInfoWrapper.finalize(Err); @@ -1425,7 +1494,9 @@ } int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, - uint64_t MemorySize, bool isRecord, + uint64_t MemorySize, + void *VAddr, + bool isRecord, bool SaveOutput) { GenericPluginTy &Plugin = Plugin::get(); GenericDeviceTy &Device = Plugin.getDevice(DeviceId); @@ -1433,7 +1504,7 @@ isRecord ? 
RecordReplayTy::RRStatusTy::RRRecording : RecordReplayTy::RRStatusTy::RRReplaying; - if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) { + if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) { REPORT("WARNING RR did not intialize RR-properly with %lu bytes" "(Error: %s)\n", MemorySize, toString(std::move(Err)).data()); Index: openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h =================================================================== --- openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -24,8 +24,95 @@ typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; +typedef unsigned long long CUmemGenericAllocationHandle_v1; +typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; + #define CU_DEVICE_INVALID ((CUdevice)-2) +typedef enum CUmemAllocationGranularity_flags_enum { + CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ + CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ +} CUmemAllocationGranularity_flags; + +typedef enum CUmemAccess_flags_enum { + CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ + CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF +} CUmemAccess_flags; + +typedef enum CUmemLocationType_enum { + CU_MEM_LOCATION_TYPE_INVALID = 0x0, + CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ + CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemLocationType; + +typedef struct CUmemLocation_st { + CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. 
*/ + int id; /**< identifier for a given this location's ::CUmemLocationType. */ +} CUmemLocation_v1; +typedef CUmemLocation_v1 CUmemLocation; + +typedef struct CUmemAccessDesc_st { + CUmemLocation location; /**< Location on which the request is to change it's accessibility */ + CUmemAccess_flags flags; /**< ::CUmemProt accessibility flags to set on the request */ +} CUmemAccessDesc_v1; + +typedef CUmemAccessDesc_v1 CUmemAccessDesc; + +typedef enum CUmemAllocationType_enum { + CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, + + /** This allocation type is 'pinned', i.e. cannot migrate from its current + * location while the application is actively using it + */ + CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, + CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationType; + +typedef enum CUmemAllocationHandleType_enum { + CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. > */ + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ + CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ + CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ + CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationHandleType; + +typedef struct CUmemAllocationProp_st { + /** Allocation type */ + CUmemAllocationType type; + /** requested ::CUmemAllocationHandleType */ + CUmemAllocationHandleType requestedHandleTypes; + /** Location of allocation */ + CUmemLocation location; + /** + * Windows-specific POBJECT_ATTRIBUTES required when + * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object atributes structure + * includes security attributes that define + * the scope of which exported allocations may be tranferred to other + * processes. In all other cases, this field is required to be zero. 
+ */ + void *win32HandleMetaData; + struct { + /** + * Allocation hint for requesting compressible memory. + * On devices that support Compute Data Compression, compressible + * memory can be used to accelerate accesses to data with unstructured + * sparsity and other compressible data patterns. Applications are + * expected to query allocation property of the handle obtained with + * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to + * validate if the obtained allocation is compressible or not. Note that + * compressed memory may not be mappable on all devices. + */ + unsigned char compressionType; + unsigned char gpuDirectRDMACapable; + /** Bitmask indicating intended usage for this allocation */ + unsigned short usage; + unsigned char reserved[4]; + } allocFlags; +} CUmemAllocationProp_v1; +typedef CUmemAllocationProp_v1 CUmemAllocationProp; + typedef enum cudaError_enum { CUDA_SUCCESS = 0, CUDA_ERROR_INVALID_VALUE = 1, @@ -268,4 +355,14 @@ CUresult cuEventSynchronize(CUevent); CUresult cuEventDestroy(CUevent); +CUresult cuMemUnmap(CUdeviceptr ptr, size_t size); +CUresult cuMemRelease(CUmemGenericAllocationHandle handle); +CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size); +CUresult cuMemGetInfo(size_t *free, size_t *total); +CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); +CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); +CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); +CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); +CUresult cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); + #endif Index: openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp 
=================================================================== --- openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -79,6 +79,16 @@ DLWRAP(cuEventSynchronize, 1) DLWRAP(cuEventDestroy, 1) +DLWRAP(cuMemUnmap, 2); +DLWRAP(cuMemRelease, 1); +DLWRAP(cuMemAddressFree, 2); +DLWRAP(cuMemGetInfo, 2); +DLWRAP(cuMemAddressReserve, 5); +DLWRAP(cuMemMap, 5); +DLWRAP(cuMemCreate, 4); +DLWRAP(cuMemSetAccess, 4); +DLWRAP(cuMemGetAllocationGranularity, 3); + DLWRAP_FINALIZE() #ifndef DYNAMIC_CUDA_PATH Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -517,6 +517,109 @@ return Plugin::check(Res, "Error in cuStreamQuery: %s"); } + /// CUDA support VA management + bool supportVAManagement() const override { return true; } + + /// Allocates \p RSize bytes (rounded up to page size) and hints the cuda driver to map it to \p VAddr. + /// The obtained address is stored in \p Addr. 
At return \p RSize contains the actual size + Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override { + CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr); + auto IHandle = DeviceMMaps.find(DVAddr); + size_t Size = *RSize; + + if ( Size == 0 ) + return Plugin::error("Memory Map Size must be larger than 0"); + + // Check if we have already mapped this address + if ( IHandle != DeviceMMaps.end() ) + return Plugin::error("Address already memory mapped"); + + CUmemAllocationProp Prop = {}; + size_t Granularity = 0; + + size_t Free, Total; + CUresult Res = cuMemGetInfo(&Free, &Total); + if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s")) + return std::move(Err); + + if ( Size >= Free ){ + *Addr = nullptr; + return Plugin::error("Cannot map memory size larger than the available device memory"); + } + + // currently NVidia only supports pinned device types + Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + + Prop.location.id = DeviceId; + Res = cuMemGetAllocationGranularity(&Granularity, &Prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM); + if (auto Err = Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s")) + return std::move(Err); + + if ( Granularity == 0 ) + return Plugin::error("Wrong device Page size"); + + // Ceil to page size. 
+ Size = (Size+Granularity-1)/Granularity * Granularity; + + // Create a handler of our allocation + CUmemGenericAllocationHandle AHandle; + Res = cuMemCreate(&AHandle, Size, &Prop, 0); + if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s")) + return std::move(Err); + + CUdeviceptr DevPtr = 0; + Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0); + if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s")) { + cuMemRelease(AHandle); + return std::move(Err); + } + + Res = cuMemMap(DevPtr, Size, 0, AHandle, 0); + if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s")) { + cuMemRelease(AHandle); + cuMemAddressFree(DevPtr, Size); + return std::move(Err); + } + + CUmemAccessDesc ADesc = {}; + ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + ADesc.location.id = DeviceId; + ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + // Sets address + Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1); + if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s")) { + cuMemUnmap(DevPtr, Size); + cuMemRelease(AHandle); + cuMemAddressFree(DevPtr, Size); + return std::move(Err); + } + + *Addr = reinterpret_cast<void *>(DevPtr); + *RSize = Size; + DeviceMMaps.insert( { DevPtr, AHandle } ); + return Plugin::success(); + } + + /// De-allocates device memory and Unmaps the Virtual Addr + Error memoryVAUnMap(void *VAddr, size_t Size) override { + CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr); + auto IHandle = DeviceMMaps.find(DVAddr); + // Mapping does not exist + if ( IHandle == DeviceMMaps.end() ){ + return Plugin::error("Addr is not MemoryMapped"); + } + CUmemGenericAllocationHandle& AllocHandle = IHandle->second; + + CUresult Res = cuMemUnmap(DVAddr, Size); + if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s")) + return std::move(Err); + + Res = cuMemRelease(AllocHandle); + if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s")) + return std::move(Err); + + Res = cuMemAddressFree(DVAddr, Size); + if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s")) + return std::move(Err); + + DeviceMMaps.erase(IHandle); + return Plugin::success(); + } + Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override { // TODO: 
Register the buffer as CUDA host memory. return HstPtr; @@ -835,6 +938,11 @@ return Plugin::check(Res, "Error in cuCtxGetLimit: %s"); } + Error getDeviceMemorySize(uint64_t &Value) override { + CUresult Res = cuDeviceTotalMem(&Value, Device); + return Plugin::check(Res , "Error in getDeviceMemorySize %s"); + } + /// CUDA-specific function to get device attributes. Error getDeviceAttr(uint32_t Kind, uint32_t &Value) { // TODO: Warn if the new value is larger than the old. @@ -872,6 +980,9 @@ /// The CUDA device handler. CUdevice Device = CU_DEVICE_INVALID; + /// The memory memory mapped addresses and their handlers + std::unordered_map DeviceMMaps; + /// The compute capability of the corresponding CUDA device. struct ComputeCapabilityTy { uint32_t Major; Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -546,6 +546,7 @@ RTL->activate_record_replay(RTLDeviceID, OMPX_DeviceMemorySize * 1024 * 1024 * 1024, + nullptr, true, OMPX_ReplaySaveOutput); } Index: openmp/libomptarget/src/interface.cpp =================================================================== --- openmp/libomptarget/src/interface.cpp +++ openmp/libomptarget/src/interface.cpp @@ -338,11 +338,13 @@ /// \param DeviceId The device identifier to execute the target region. /// \param MemorySize The number of bytes to be (pre-)allocated /// by the bump allocator +/// \param VAddr The Virtual Address to assing to the initial +/// address of the BumpAllocator /// /param IsRecord Activates the record replay mechanism in /// 'record' mode or 'replay' mode. 
/// /param SaveOutput Store the device memory after kernel /// execution on persistent storage -EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, +EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput) { if (!deviceIsReady(DeviceId)) { DP("Device %" PRId64 " is not ready\n", DeviceId); @@ -350,7 +352,7 @@ } DeviceTy &Device = *PM->Devices[DeviceId]; - int Rc = target_activate_rr(Device, MemorySize, IsRecord, SaveOutput); + int Rc = target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput); assert(Rc == OFFLOAD_SUCCESS && "__tgt_activate_record_replay unexpected failure!"); return OMP_TGT_SUCCESS; Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -1715,9 +1715,9 @@ /// Enables the record replay mechanism by pre-allocating MemorySize /// and informing the record-replayer of whether to store the output /// in some file. 
-int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, bool isRecord, +int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr, bool isRecord, bool SaveOutput) { - return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, + return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr, isRecord, SaveOutput); } Index: openmp/libomptarget/src/private.h =================================================================== --- openmp/libomptarget/src/private.h +++ openmp/libomptarget/src/private.h @@ -41,7 +41,7 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo); -extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, +extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *reqAddr, bool isRecord, bool SaveOutput); extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, Index: openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp =================================================================== --- openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp +++ openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp @@ -88,6 +88,8 @@ TgtArgOffsets.push_back( reinterpret_cast(It.getAsInteger().value())); + void *BAllocStart = reinterpret_cast(JsonKernelInfo->getAsObject()->getInteger("BumpAllocVAStart").value()); + __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0}; std::string KernelEntryName = KernelFunc.value().str(); KernelEntry.name = const_cast(KernelEntryName.c_str()); @@ -126,7 +128,7 @@ __tgt_register_lib(&Desc); - int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, false, + int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart, false, VerifyOpt); if (Rc != OMP_TGT_SUCCESS) {