diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -433,6 +433,10 @@ void __tgt_set_info_flag(uint32_t); int __tgt_print_device_info(int64_t DeviceId); + +int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, + bool IsRecord, bool SaveOutput); + #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h --- a/openmp/libomptarget/include/rtl.h +++ b/openmp/libomptarget/include/rtl.h @@ -72,6 +72,7 @@ typedef int32_t(data_unlock_ty)(int32_t, void *); typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t); typedef int32_t(data_notify_unmapped_ty)(int32_t, void *); + typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -124,6 +125,7 @@ data_unlock_ty *data_unlock = nullptr; data_notify_mapped_ty *data_notify_mapped = nullptr; data_notify_unmapped_ty *data_notify_unmapped = nullptr; + activate_record_replay_ty *activate_record_replay = nullptr; // Are there images associated with this RTL. bool IsUsed = false; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp @@ -358,5 +358,7 @@ auto BitcodeTA = Triple(ActualTriple).getArch(); BitcodeImageMap[Image.ImageStart] = BitcodeTA; + DP("Is%s IR Image\n", BitcodeTA == TT.getArch() ? 
" " : " NOT"); + return BitcodeTA == TT.getArch(); } diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -194,6 +194,11 @@ private: __tgt_target_table TTTablePtr; llvm::SmallVector<__tgt_offload_entry> Entries; + + public: + using const_iterator = decltype(Entries)::const_iterator; + const_iterator begin() const { return Entries.begin(); } + const_iterator end() const { return Entries.end(); } }; /// Image identifier within the corresponding device. Notice that this id is @@ -274,6 +279,12 @@ /// Get the kernel name. const char *getName() const { return Name; } + /// Get the kernel image. + DeviceImageTy &getImage() const { + assert(ImagePtr && "Kernel is not initialized!"); + return *ImagePtr; + } + /// Indicate whether an execution mode is valid. static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) { switch (ExecutionMode) { @@ -343,6 +354,9 @@ /// The execution flags of the kernel. OMPTgtExecModeFlags ExecutionMode; + /// The image that contains this kernel. + DeviceImageTy *ImagePtr = nullptr; + protected: /// The preferred number of threads to run the kernel. uint32_t PreferredNumThreads; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -39,6 +39,10 @@ // TODO: Fix any thread safety issues for multi-threaded kernel recording. struct RecordReplayTy { + + // Describes the state of the record replay mechanism. 
+ enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying }; + private: // Memory pointers for recording, replaying memory. void *MemoryStart; @@ -47,26 +51,19 @@ GenericDeviceTy *Device; std::mutex AllocationLock; - // Environment variables for record and replay. - // Enables recording kernels if set. - BoolEnvar OMPX_RecordKernel; - // Enables replaying a kernel if set. - BoolEnvar OMPX_ReplayKernel; - // Enables saving the device memory kernel output post execution if set. - BoolEnvar OMPX_ReplaySaveOutput; - // Sets the maximum to pre-allocate device memory. - UInt32Envar OMPX_DeviceMemorySize; + RRStatusTy Status; + bool ReplaySaveOutput; + uint64_t DeviceMemorySize; // Record/replay pre-allocates the largest possible device memory using the // default kind. // TODO: Expand allocation to include other kinds (device, host, shared) and // possibly use a MemoryManager to track (de-)allocations for // storing/retrieving when recording/replaying. - Error preallocateDeviceMemory() { + Error preallocateDeviceMemory(uint64_t DeviceMemorySize) { // Pre-allocate memory on device. Starts with 64GB and subtracts in steps // of 1GB until allocation succeeds. 
- const size_t MAX_MEMORY_ALLOCATION = - OMPX_DeviceMemorySize * 1024 * 1024 * 1024ULL; + const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize; constexpr size_t STEP = 1024 * 1024 * 1024ULL; MemoryStart = nullptr; for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) { @@ -85,15 +82,14 @@ return Plugin::success(); } - void dumpDeviceMemory(StringRef Filename, - AsyncInfoWrapperTy &AsyncInfoWrapper) { + void dumpDeviceMemory(StringRef Filename) { ErrorOr> DeviceMemoryMB = WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize); if (!DeviceMemoryMB) report_fatal_error("Error creating MemoryBuffer for device memory"); auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(), - MemoryStart, MemorySize, AsyncInfoWrapper); + MemoryStart, MemorySize, nullptr); if (Err) report_fatal_error("Error retrieving data for target pointer"); @@ -108,21 +104,19 @@ } public: - bool isRecording() const { return OMPX_RecordKernel; } - bool isReplaying() const { return OMPX_ReplayKernel; } + bool isRecording() const { return Status == RRStatusTy::RRRecording; } + bool isReplaying() const { return Status == RRStatusTy::RRReplaying; } bool isRecordingOrReplaying() const { - return (OMPX_RecordKernel || OMPX_ReplayKernel); + return (Status != RRStatusTy::RRDeactivated); } - bool isSaveOutputEnabled() const { return OMPX_ReplaySaveOutput; } + void setStatus(RRStatusTy Status) { this->Status = Status; } + bool isSaveOutputEnabled() const { return ReplaySaveOutput; } RecordReplayTy() - : OMPX_RecordKernel("LIBOMPTARGET_RECORD"), - OMPX_ReplayKernel("LIBOMPTARGET_REPLAY"), - OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT"), - OMPX_DeviceMemorySize("LIBOMPTARGET_RR_DEVMEM_SIZE", - /* Default in GB */ 64) {} + : Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false), + DeviceMemorySize(-1) {} - void saveImage(const char *Name, DeviceImageTy &Image) { + void saveImage(const char *Name, const DeviceImageTy &Image) { SmallString<128> ImageName = {Name, ".image"}; 
std::error_code EC; raw_fd_ostream OS(ImageName, EC); @@ -140,11 +134,60 @@ OS.close(); } - void saveKernelInputInfo(const char *Name, void **ArgPtrs, - ptrdiff_t *ArgOffsets, int32_t NumArgs, - uint64_t NumTeamsClause, uint32_t ThreadLimitClause, - uint64_t LoopTripCount, - AsyncInfoWrapperTy &AsyncInfoWrapper) { + void dumpGlobals(StringRef Filename, DeviceImageTy &Image) { + int32_t Size = 0; + + for (auto &OffloadEntry : Image.getOffloadEntryTable()) { + if (!OffloadEntry.size) + continue; + Size += std::strlen(OffloadEntry.name) + /* '\0' */ 1 + + /* OffloadEntry.size value */ sizeof(uint32_t) + + OffloadEntry.size; + } + + ErrorOr> GlobalsMB = + WritableMemoryBuffer::getNewUninitMemBuffer(Size); + if (!GlobalsMB) + report_fatal_error("Error creating MemoryBuffer for globals memory"); + + void *BufferPtr = GlobalsMB.get()->getBufferStart(); + for (auto &OffloadEntry : Image.getOffloadEntryTable()) { + if (!OffloadEntry.size) + continue; + + int32_t NameLength = std::strlen(OffloadEntry.name) + 1; + memcpy(BufferPtr, OffloadEntry.name, NameLength); + BufferPtr = advanceVoidPtr(BufferPtr, NameLength); + + *((uint32_t *)(BufferPtr)) = OffloadEntry.size; + BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); + + auto Err = Plugin::success(); + { + if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.addr, + OffloadEntry.size, nullptr)) + report_fatal_error("Error retrieving data for global"); + } + if (Err) + report_fatal_error("Error retrieving data for global"); + BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.size); + } + assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && + "Buffer over/under-filled."); + assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) && + "Buffer size mismatch"); + + StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size); + std::error_code EC; + raw_fd_ostream OS(Filename, EC); + OS << GlobalsMemory; + OS.close(); + } + + void saveKernelInputInfo(const char *Name, DeviceImageTy &Image, + 
void **ArgPtrs, ptrdiff_t *ArgOffsets, + int32_t NumArgs, uint64_t NumTeamsClause, + uint32_t ThreadLimitClause, uint64_t LoopTripCount) { json::Object JsonKernelInfo; JsonKernelInfo["Name"] = Name; JsonKernelInfo["NumArgs"] = NumArgs; @@ -165,7 +208,10 @@ JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets)); SmallString<128> MemoryFilename = {Name, ".memory"}; - dumpDeviceMemory(MemoryFilename, AsyncInfoWrapper); + dumpDeviceMemory(MemoryFilename); + + SmallString<128> GlobalsFilename = {Name, ".globals"}; + dumpGlobals(GlobalsFilename, Image); SmallString<128> JsonFilename = {Name, ".json"}; std::error_code EC; @@ -177,11 +223,10 @@ JsonOS.close(); } - void saveKernelOutputInfo(const char *Name, - AsyncInfoWrapperTy &AsyncInfoWrapper) { + void saveKernelOutputInfo(const char *Name) { SmallString<128> OutputFilename = { Name, (isRecording() ? ".original.output" : ".replay.output")}; - dumpDeviceMemory(OutputFilename, AsyncInfoWrapper); + dumpDeviceMemory(OutputFilename); } void *alloc(uint64_t Size) { @@ -194,12 +239,28 @@ Alloc = MemoryPtr; MemoryPtr = (char *)MemoryPtr + AlignedSize; MemorySize += AlignedSize; + DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc)); return Alloc; } - Error init(GenericDeviceTy *Device) { + Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status, + bool SaveOutput) { this->Device = Device; - return preallocateDeviceMemory(); + this->Status = Status; + this->DeviceMemorySize = MemSize; + this->ReplaySaveOutput = SaveOutput; + + if (auto Err = preallocateDeviceMemory(MemSize)) + return Err; + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), + "Record Replay Initialized (%p)" + " as starting address, %lu Memory Size" + " and set on status %s\n", + MemoryStart, MemSize, + Status == RRStatusTy::RRRecording ? 
"Recording" : "Replaying"); + + return Plugin::success(); } void deinit() { Device->free(MemoryStart); } @@ -227,7 +288,11 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) { + + ImagePtr = &Image; + PreferredNumThreads = GenericDevice.getDefaultNumThreads(); + MaxNumThreads = GenericDevice.getThreadLimit(); return initImpl(GenericDevice, Image); @@ -468,10 +533,6 @@ if (EnableMM) MemoryManager = new MemoryManagerTy(*this, ThresholdMM); - if (RecordReplay.isRecordingOrReplaying()) - if (auto Err = RecordReplay.init(this)) - return Err; - return Plugin::success(); } @@ -1087,26 +1148,31 @@ ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo) { - AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + AsyncInfoWrapperTy AsyncInfoWrapper( + *this, RecordReplay.isRecordingOrReplaying() ? nullptr : AsyncInfo); GenericKernelTy &GenericKernel = *reinterpret_cast(EntryPtr); if (RecordReplay.isRecording()) RecordReplay.saveKernelInputInfo( - GenericKernel.getName(), ArgPtrs, ArgOffsets, KernelArgs.NumArgs, - KernelArgs.NumTeams[0], KernelArgs.ThreadLimit[0], KernelArgs.Tripcount, - AsyncInfoWrapper); + GenericKernel.getName(), GenericKernel.getImage(), ArgPtrs, ArgOffsets, + KernelArgs.NumArgs, KernelArgs.NumTeams[0], KernelArgs.ThreadLimit[0], + KernelArgs.Tripcount); + + if (RecordReplay.isRecording()) + RecordReplay.saveImage(GenericKernel.getName(), GenericKernel.getImage()); auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs, AsyncInfoWrapper); + // 'finalize' here to guarantee next record-replay actions are in-sync + AsyncInfoWrapper.finalize(Err); + if (RecordReplay.isRecordingOrReplaying() && RecordReplay.isSaveOutputEnabled()) - RecordReplay.saveKernelOutputInfo(GenericKernel.getName(), - AsyncInfoWrapper); + RecordReplay.saveKernelOutputInfo(GenericKernel.getName()); - AsyncInfoWrapper.finalize(Err); return Err; } @@ -1358,6 +1424,28 @@ return 
Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId); } +int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, + uint64_t MemorySize, bool isRecord, + bool SaveOutput) { + GenericPluginTy &Plugin = Plugin::get(); + GenericDeviceTy &Device = Plugin.getDevice(DeviceId); + RecordReplayTy::RRStatusTy Status = + isRecord ? RecordReplayTy::RRStatusTy::RRRecording + : RecordReplayTy::RRStatusTy::RRReplaying; + + if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) { + REPORT("WARNING RR did not intialize RR-properly with %lu bytes" + "(Error: %s)\n", + MemorySize, toString(std::move(Err)).data()); + RecordReplay.setStatus(RecordReplayTy::RRStatusTy::RRDeactivated); + + if (!isRecord) { + return OFFLOAD_FAIL; + } + } + return OFFLOAD_SUCCESS; +} + __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, __tgt_device_image *TgtImage) { GenericPluginTy &Plugin = Plugin::get(); diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -17,6 +17,8 @@ #include "private.h" #include "rtl.h" +#include "Utilities.h" + #include #include #include @@ -530,6 +532,23 @@ if (Ret != OFFLOAD_SUCCESS) return; + // Enables recording kernels if set. + llvm::omp::target::BoolEnvar OMPX_RecordKernel("LIBOMPTARGET_RECORD", false); + if (OMPX_RecordKernel) { + // Enables saving the device memory kernel output post execution if set. + llvm::omp::target::BoolEnvar OMPX_ReplaySaveOutput( + "LIBOMPTARGET_RR_SAVE_OUTPUT", false); + // Sets the maximum to pre-allocate device memory. 
+ llvm::omp::target::UInt64Envar OMPX_DeviceMemorySize( + "LIBOMPTARGET_RR_DEVMEM_SIZE", 16); + DP("Activating Record-Replay for Device %d with %lu GB memory\n", + RTLDeviceID, OMPX_DeviceMemorySize); + + RTL->activate_record_replay(RTLDeviceID, + OMPX_DeviceMemorySize * 1024 * 1024 * 1024, + true, OMPX_ReplaySaveOutput); + } + IsInit = true; } diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -28,6 +28,7 @@ __tgt_target_kernel_nowait; __tgt_target_nowait_query; __tgt_target_kernel_replay; + __tgt_activate_record_replay; __tgt_mapper_num_components; __tgt_push_mapper_component; __kmpc_push_target_tripcount; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -334,6 +334,28 @@ HostPtr, KernelArgs); } +/// Activates the record replay mechanism. +/// \param DeviceId The device identifier to execute the target region. +/// \param MemorySize The number of bytes to be (pre-)allocated +/// by the bump allocator +/// \param IsRecord Activates the record replay mechanism in +/// 'record' mode or 'replay' mode. +/// \param SaveOutput Store the device memory after kernel +/// execution on persistent storage +EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, + bool IsRecord, bool SaveOutput) { + if (!deviceIsReady(DeviceId)) { + DP("Device %" PRId64 " is not ready\n", DeviceId); + return OMP_TGT_FAIL; + } + + DeviceTy &Device = *PM->Devices[DeviceId]; + int Rc = target_activate_rr(Device, MemorySize, IsRecord, SaveOutput); + assert(Rc == OFFLOAD_SUCCESS && + "__tgt_activate_record_replay unexpected failure!"); + return OMP_TGT_SUCCESS; +}; + /// Implements a target kernel entry that replays a pre-recorded kernel. /// \param Loc Source location associated with this target region (unused). 
/// \param DeviceId The device identifier to execute the target region. diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -1712,6 +1712,15 @@ return OFFLOAD_SUCCESS; } +/// Enables the record replay mechanism by pre-allocating MemorySize +/// and informing the record-replayer of whether to store the output +/// in some file. +int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, bool isRecord, + bool SaveOutput) { + return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, + isRecord, SaveOutput); +} + /// Executes a kernel using pre-recorded information for loading to /// device memory to launch the target kernel with the pre-recorded /// configuration. diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -41,6 +41,9 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo); +extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, + bool isRecord, bool SaveOutput); + extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -250,6 +250,10 @@ *((void **)&RTL.data_notify_unmapped) = DynLibrary->getAddressOfSymbol("__tgt_rtl_data_notify_unmapped"); + // Record Replay RTL + *((void **)&RTL.activate_record_replay) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_initialize_record_replay"); + RTL.LibraryHandler = std::move(DynLibrary); // Successfully loaded diff --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp 
b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp --- a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp +++ b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "omptarget.h" #include "omptargetplugin.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/JSON.h" @@ -111,24 +112,10 @@ Desc.HostEntriesEnd = &KernelEntry + 1; Desc.DeviceImages = &DeviceImage; - ErrorOr> DeviceMemoryMB = - MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false, - /* RequiresNullTerminator */ false); - if (!DeviceMemoryMB) - report_fatal_error("Error reading the kernel input device memory."); - - setenv("LIBOMPTARGET_REPLAY", "1", 1); - if (VerifyOpt || SaveOutputOpt) - setenv("LIBOMPTARGET_RR_SAVE_OUTPUT", "1", 1); - auto DeviceMemorySizeJson = JsonKernelInfo->getAsObject()->getInteger("DeviceMemorySize"); // Set device memory size to the ceiling of GB granularity. - uint64_t DeviceMemorySize = - std::ceil(DeviceMemorySizeJson.value() / (1024.0 * 1024.0 * 1024.0)); - - setenv("LIBOMPTARGET_RR_DEVMEM_SIZE", - std::to_string(DeviceMemorySize).c_str(), 1); + uint64_t DeviceMemorySize = std::ceil(DeviceMemorySizeJson.value()); auto DeviceIdJson = JsonKernelInfo->getAsObject()->getInteger("DeviceId"); // TODO: Print warning if the user overrides the device id in the json file. @@ -137,13 +124,31 @@ // TODO: do we need requires? 
//__tgt_register_requires(/* Flags */1); - __tgt_init_all_rtls(); - __tgt_register_lib(&Desc); + int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, false, + VerifyOpt); + + if (Rc != OMP_TGT_SUCCESS) { + report_fatal_error("Cannot activate record replay\n"); + } + + ErrorOr> DeviceMemoryMB = + MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false, + /* RequiresNullTerminator */ false); + + if (!DeviceMemoryMB) + report_fatal_error("Error reading the kernel input device memory."); + + // On AMD for currently unknown reasons we cannot copy memory mapped data to + // device. This is a work-around. + uint8_t *recored_data = new uint8_t[DeviceMemoryMB.get()->getBufferSize()]; + std::memcpy(recored_data, + const_cast(DeviceMemoryMB.get()->getBuffer().data()), + DeviceMemorySizeJson.value() * sizeof(uint8_t)); + __tgt_target_kernel_replay( - /* Loc */ nullptr, DeviceId, KernelEntry.addr, - const_cast(DeviceMemoryMB.get()->getBuffer().data()), + /* Loc */ nullptr, DeviceId, KernelEntry.addr, (char *)recored_data, DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(), TgtArgOffsets.data(), NumArgs.value(), NumTeams, NumThreads, LoopTripCount.value()); @@ -171,6 +176,9 @@ outs() << "[llvm-omp-kernel-replay] Replay device memory failed to " "verify!\n"; } + + delete[] recored_data; + // TODO: calling unregister lib causes plugin deinit error for nextgen // plugins. //__tgt_unregister_lib(&Desc);