diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -416,6 +416,14 @@
 // data.
 void __tgt_target_nowait_query(void **AsyncHandle);
 
+/// Executes a target kernel by replaying recorded kernel arguments and
+/// device memory.
+int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
+                               void *DeviceMemory, int64_t DeviceMemorySize,
+                               void **TgtArgs, ptrdiff_t *TgtOffsets,
+                               int32_t NumArgs, int32_t NumTeams,
+                               int32_t ThreadLimit, uint64_t LoopTripCount);
+
 void __tgt_set_info_flag(uint32_t);
 
 int __tgt_print_device_info(int64_t DeviceId);
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -18,6 +18,8 @@
 
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
 
 #include <cstdint>
 #include <limits>
@@ -29,6 +31,168 @@
 
 GenericPluginTy *Plugin::SpecificPlugin = nullptr;
 
+// TODO: Fix any thread safety issues for multi-threaded kernel recording.
+struct RecordReplayTy {
+private:
+  // Memory pointers for recording, replaying memory.
+  void *MemoryStart;
+  void *MemoryPtr;
+  size_t MemorySize;
+  GenericDeviceTy *Device;
+  std::mutex AllocationLock;
+
+  // Environment variables for record and replay.
+  // Enables recording kernels if set.
+  BoolEnvar OMPX_RecordKernel;
+  // Enables replaying a kernel if set.
+  BoolEnvar OMPX_ReplayKernel;
+  // Enables saving the device memory kernel output post execution if set.
+  BoolEnvar OMPX_ReplaySaveOutput;
+  // Sets the maximum size, in GB, of the pre-allocated device memory.
+  UInt32Envar OMPX_DeviceMemorySize;
+
+  // Record/replay pre-allocates the largest possible device memory using the
+  // default kind.
+  // TODO: Expand allocation to include other kinds (device, host, shared) and
+  // possibly use a MemoryManager to track (de-)allocations for
+  // storing/retrieving when recording/replaying.
+  Error preallocateDeviceMemory() {
+    // Pre-allocate memory on device. Starts with the configured size (64GB by
+    // default) and subtracts in steps of 1GB until allocation succeeds.
+    constexpr size_t STEP = 1024 * 1024 * 1024ULL;
+    // Scale by the 64-bit STEP so the product cannot wrap in 32-bit math.
+    const size_t MAX_MEMORY_ALLOCATION = OMPX_DeviceMemorySize * STEP;
+    MemoryStart = nullptr;
+    for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) {
+      MemoryStart =
+          Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+      if (MemoryStart)
+        break;
+    }
+
+    if (!MemoryStart)
+      return Plugin::error("Allocating record/replay memory");
+
+    MemoryPtr = MemoryStart;
+    MemorySize = 0;
+
+    return Plugin::success();
+  }
+
+  void dumpDeviceMemory(StringRef Filename,
+                        AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
+        WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
+    if (!DeviceMemoryMB)
+      report_fatal_error("Error creating MemoryBuffer for device memory");
+
+    auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
+                                    MemoryStart, MemorySize, AsyncInfoWrapper);
+    if (Err)
+      report_fatal_error("Error retrieving data for target pointer");
+
+    StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize);
+    std::error_code EC;
+    raw_fd_ostream OS(Filename, EC);
+    if (EC)
+      report_fatal_error("Error dumping memory to file " + Filename + " :" +
+                         EC.message());
+    OS << DeviceMemory;
+    OS.close();
+  }
+
+public:
+  bool isRecording() const { return OMPX_RecordKernel; }
+  bool isReplaying() const { return OMPX_ReplayKernel; }
+  bool isRecordingOrReplaying() const {
+    return (OMPX_RecordKernel || OMPX_ReplayKernel);
+  }
+  bool isSaveOutputEnabled() const { return OMPX_ReplaySaveOutput; }
+
+  RecordReplayTy()
+      : OMPX_RecordKernel("LIBOMPTARGET_RECORD"),
+        OMPX_ReplayKernel("LIBOMPTARGET_REPLAY"),
+        OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT"),
+        OMPX_DeviceMemorySize("LIBOMPTARGET_RR_DEVMEM_SIZE",
+                              /* Default in GB */ 64) {}
+
+  void saveImage(const char *Name, DeviceImageTy &Image) {
+    const std::string ImageName = std::string(Name) + ".image";
+    std::error_code EC;
+    raw_fd_ostream OS(ImageName, EC);
+    if (EC)
+      report_fatal_error("Error saving image : " + StringRef(EC.message()));
+    OS << Image.getMemoryBuffer().getBuffer();
+    OS.close();
+  }
+
+  void saveKernelInputInfo(const char *Name, void **ArgPtrs,
+                           ptrdiff_t *ArgOffsets, int32_t NumArgs,
+                           uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
+                           uint64_t LoopTripCount,
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    json::Object JsonKernelInfo;
+    JsonKernelInfo["Name"] = Name;
+    JsonKernelInfo["NumArgs"] = NumArgs;
+    JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
+    JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
+    JsonKernelInfo["LoopTripCount"] = LoopTripCount;
+    JsonKernelInfo["DeviceMemorySize"] = MemorySize;
+    JsonKernelInfo["DeviceId"] = Device->getDeviceId();
+
+    json::Array JsonArgPtrs;
+    for (int I = 0; I < NumArgs; ++I)
+      JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
+    JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
+
+    json::Array JsonArgOffsets;
+    for (int I = 0; I < NumArgs; ++I)
+      JsonArgOffsets.push_back(ArgOffsets[I]);
+    JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
+
+    const std::string KernelName(Name);
+    const std::string MemoryFilename = KernelName + ".memory";
+    dumpDeviceMemory(MemoryFilename, AsyncInfoWrapper);
+
+    const std::string JsonFilename = KernelName + ".json";
+    std::error_code EC;
+    raw_fd_ostream JsonOS(JsonFilename, EC);
+    if (EC)
+      report_fatal_error("Error saving kernel json file : " +
+                         StringRef(EC.message()));
+    JsonOS << json::Value(std::move(JsonKernelInfo));
+    JsonOS.close();
+  }
+
+  void saveKernelOutputInfo(const char *Name,
+                            AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    const char *Suffix = isRecording() ? ".original.output" : ".replay.output";
+    const std::string OutputFilename = std::string(Name) + Suffix;
+    dumpDeviceMemory(OutputFilename, AsyncInfoWrapper);
+  }
+
+  void *alloc(uint64_t Size) {
+    assert(MemoryStart && "Expected memory has been pre-allocated");
+    void *Alloc = nullptr;
+    constexpr int ALIGN = 16;
+    // Assumes alignment is a power of 2.
+    int64_t AlignedSize = (Size + (ALIGN - 1)) & (~(ALIGN - 1));
+    std::lock_guard<std::mutex> LG(AllocationLock);
+    Alloc = MemoryPtr;
+    MemoryPtr = (char *)MemoryPtr + AlignedSize;
+    MemorySize += AlignedSize;
+    return Alloc;
+  }
+
+  Error init(GenericDeviceTy *Device) {
+    this->Device = Device;
+    return preallocateDeviceMemory();
+  }
+
+  void deinit() { Device->free(MemoryStart); }
+
+} RecordReplay;
+
 AsyncInfoWrapperTy::~AsyncInfoWrapperTy() {
   // If we used a local async info object we want synchronous behavior.
   // In that case, and assuming the current status code is OK, we will
@@ -45,6 +209,9 @@
 
   DynamicMemorySize = GenericDevice.getDynamicMemorySize();
 
+  if (RecordReplay.isRecording())
+    RecordReplay.saveImage(Name, Image);
+
   return initImpl(GenericDevice, Image);
 }
 
@@ -197,6 +364,10 @@
   if (EnableMM)
     MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
 
+  if (RecordReplay.isRecordingOrReplaying())
+    if (auto Err = RecordReplay.init(this))
+      return Err;
+
   return Plugin::success();
 }
 
@@ -207,6 +378,9 @@
     delete MemoryManager;
   MemoryManager = nullptr;
 
+  if (RecordReplay.isRecordingOrReplaying())
+    RecordReplay.deinit();
+
   return deinitImpl();
 }
 
@@ -437,6 +611,9 @@
                                             TargetAllocTy Kind) {
   void *Alloc = nullptr;
 
+  if (RecordReplay.isRecordingOrReplaying())
+    return RecordReplay.alloc(Size);
+
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
   case TARGET_ALLOC_DEVICE:
@@ -469,6 +646,10 @@
 }
 
 Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
+  // Free is a noop when recording or replaying.
+  if (RecordReplay.isRecordingOrReplaying())
+    return Plugin::success();
+
   int Res;
   if (MemoryManager)
     Res = MemoryManager->free(TgtPtr);
@@ -521,9 +702,20 @@
   GenericKernelTy &GenericKernel =
       *reinterpret_cast<GenericKernelTy *>(EntryPtr);
 
+  if (RecordReplay.isRecording())
+    RecordReplay.saveKernelInputInfo(
+        GenericKernel.getName(), ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause,
+        ThreadLimitClause, LoopTripCount, AsyncInfoWrapper);
+
   Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs,
                              NumTeamsClause, ThreadLimitClause, LoopTripCount,
                              AsyncInfoWrapper);
+
+  if (RecordReplay.isRecordingOrReplaying() &&
+      RecordReplay.isSaveOutputEnabled())
+    RecordReplay.saveKernelOutputInfo(GenericKernel.getName(),
+                                      AsyncInfoWrapper);
+
   return Err;
 }
 
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -27,6 +27,7 @@
     __tgt_target_kernel;
     __tgt_target_kernel_nowait;
     __tgt_target_nowait_query;
+    __tgt_target_kernel_replay;
     __tgt_mapper_num_components;
     __tgt_push_mapper_component;
     __kmpc_push_target_tripcount;
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -265,6 +265,48 @@
                                    HostPtr, Args);
 }
 
+/// Implements a target kernel entry that replays a pre-recorded kernel.
+/// \param Loc Source location associated with this target region (unused).
+/// \param DeviceId The device identifier to execute the target region.
+/// \param HostPtr A pointer to an address that uniquely identifies the kernel.
+/// \param DeviceMemory A pointer to an array storing device memory data to move
+///                     prior to kernel execution.
+/// \param DeviceMemorySize The size of the above device memory data in bytes.
+/// \param TgtArgs An array of pointers of the pre-recorded target kernel
+///                arguments.
+/// \param TgtOffsets An array of the pre-recorded target kernel argument
+///                   offsets.
+/// \param NumArgs The number of kernel arguments.
+/// \param NumTeams Number of teams to launch the target region with.
+/// \param ThreadLimit Limit to the number of threads to use in kernel
+///                    execution.
+/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
+/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
+EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
+                                      void *HostPtr, void *DeviceMemory,
+                                      int64_t DeviceMemorySize, void **TgtArgs,
+                                      ptrdiff_t *TgtOffsets, int32_t NumArgs,
+                                      int32_t NumTeams, int32_t ThreadLimit,
+                                      uint64_t LoopTripCount) {
+
+  if (checkDeviceAndCtors(DeviceId, Loc)) {
+    DP("Not offloading to device %" PRId64 "\n", DeviceId);
+    return OMP_TGT_FAIL;
+  }
+  DeviceTy &Device = *PM->Devices[DeviceId];
+
+  AsyncInfoTy AsyncInfo(Device);
+  int Rc = target_replay(Loc, Device, HostPtr, DeviceMemory, DeviceMemorySize,
+                         TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit,
+                         LoopTripCount, AsyncInfo);
+  if (Rc == OFFLOAD_SUCCESS)
+    Rc = AsyncInfo.synchronize();
+  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+  assert(Rc == OFFLOAD_SUCCESS &&
+         "__tgt_target_kernel_replay unexpected failure!");
+  return OMP_TGT_SUCCESS;
+}
+
 EXTERN int __tgt_target_kernel_nowait(
     ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit,
     void *HostPtr, __tgt_kernel_arguments *Args, int32_t DepNum, void *DepList,
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -1714,3 +1714,63 @@
 
   return OFFLOAD_SUCCESS;
 }
+
+/// Executes a kernel using pre-recorded information for loading to
+/// device memory to launch the target kernel with the pre-recorded
+/// configuration.
+/// \return OFFLOAD_SUCCESS on success, OFFLOAD_FAIL on failure.
+int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
+                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
+                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
+                  int32_t ThreadLimit, uint64_t LoopTripCount,
+                  AsyncInfoTy &AsyncInfo) {
+  int32_t DeviceId = Device.DeviceID;
+  TableMap *TM = getTableMap(HostPtr);
+  // Fail if the table map fails to find the target kernel pointer for the
+  // provided host pointer.
+  if (!TM) {
+    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
+           DPxPTR(HostPtr));
+    return OFFLOAD_FAIL;
+  }
+
+  // Retrieve the target table of offloading entries.
+  __tgt_target_table *TargetTable = nullptr;
+  {
+    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
+    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
+           "Not expecting a device ID outside the table's bounds!");
+    TargetTable = TM->Table->TargetsTable[DeviceId];
+  }
+  assert(TargetTable && "Global data has not been mapped\n");
+
+  // Retrieve the target kernel pointer, allocate and store the recorded device
+  // memory data, and launch device execution.
+  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
+  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
+     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
+
+  void *TgtPtr = Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr,
+                                  TARGET_ALLOC_DEFAULT);
+  // Do not launch the kernel if the recorded memory cannot be restored.
+  if (!TgtPtr) {
+    REPORT("Failed to allocate device memory for the recorded data.\n");
+    return OFFLOAD_FAIL;
+  }
+  if (Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo) !=
+      OFFLOAD_SUCCESS) {
+    REPORT("Failed to submit the recorded data to device memory.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  int Ret =
+      Device.runTeamRegion(TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams,
+                           ThreadLimit, LoopTripCount, AsyncInfo);
+
+  if (Ret != OFFLOAD_SUCCESS) {
+    REPORT("Executing target region abort target.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -45,6 +45,12 @@
                   uint64_t Tripcount, int IsTeamConstruct,
                   AsyncInfoTy &AsyncInfo);
 
+extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
+                         void *DeviceMemory, int64_t DeviceMemorySize,
+                         void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
+                         int32_t NumTeams, int32_t ThreadLimit,
+                         uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
+
 extern void handleTargetOutcome(bool Success, ident_t *Loc);
 extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc);
 extern void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
diff --git a/openmp/libomptarget/tools/CMakeLists.txt b/openmp/libomptarget/tools/CMakeLists.txt
--- a/openmp/libomptarget/tools/CMakeLists.txt
+++ b/openmp/libomptarget/tools/CMakeLists.txt
@@ -25,3 +25,4 @@
 endmacro()
 
 add_subdirectory(deviceinfo)
+add_subdirectory(kernelreplay)
diff --git a/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt b/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt
@@ -0,0 +1,26 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build llvm-omp-kernel-replay tool
+#
+##===----------------------------------------------------------------------===##
+
+libomptarget_say("Building the llvm-omp-kernel-replay tool")
+
+add_openmp_tool(llvm-omp-kernel-replay llvm-omp-kernel-replay.cpp)
+
+llvm_update_compile_flags(llvm-omp-kernel-replay)
+
+target_include_directories(llvm-omp-kernel-replay PRIVATE
+  ${LIBOMPTARGET_INCLUDE_DIR}
+)
+target_link_libraries(llvm-omp-kernel-replay PRIVATE
+  LLVMSupport
+  omp
+  omptarget
+)
diff --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -0,0 +1,179 @@
+//===- llvm-omp-kernel-replay.cpp - Replay OpenMP offload kernel ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a command line utility to replay the execution of recorded OpenMP
+// offload kernels.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptargetplugin.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+cl::OptionCategory ReplayOptions("llvm-omp-kernel-replay Options");
+
+// InputFilename - The filename to read the json description of the kernel.
+static cl::opt<std::string> InputFilename(cl::Positional,
+                                          cl::desc("<input kernel json file>"),
+                                          cl::Required);
+
+static cl::opt<bool> VerifyOpt(
+    "verify",
+    cl::desc(
+        "Verify device memory post execution against the original output."),
+    cl::init(false), cl::cat(ReplayOptions));
+
+static cl::opt<bool> SaveOutputOpt(
+    "save-output",
+    cl::desc("Save the device memory output of the replayed kernel execution."),
+    cl::init(false), cl::cat(ReplayOptions));
+
+static cl::opt<unsigned> NumTeamsOpt("num-teams",
+                                     cl::desc("Set the number of teams."),
+                                     cl::init(0), cl::cat(ReplayOptions));
+
+static cl::opt<unsigned> NumThreadsOpt("num-threads",
+                                       cl::desc("Set the number of threads."),
+                                       cl::init(0), cl::cat(ReplayOptions));
+
+static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."),
+                                    cl::init(-1), cl::cat(ReplayOptions));
+
+int main(int argc, char **argv) {
+  cl::HideUnrelatedOptions(ReplayOptions);
+  cl::ParseCommandLineOptions(argc, argv, "llvm-omp-kernel-replay\n");
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> KernelInfoMB =
+      MemoryBuffer::getFile(InputFilename, /* isText */ true,
+                            /* RequiresNullTerminator */ true);
+  if (!KernelInfoMB)
+    report_fatal_error("Error reading the kernel info json file");
+  Expected<json::Value> JsonKernelInfo =
+      json::parse(KernelInfoMB.get()->getBuffer());
+  if (auto Err = JsonKernelInfo.takeError())
+    report_fatal_error("Cannot parse the kernel info json file");
+
+  auto NumTeamsJson =
+      JsonKernelInfo->getAsObject()->getInteger("NumTeamsClause");
+  unsigned NumTeams = (NumTeamsOpt > 0 ? NumTeamsOpt : NumTeamsJson.value());
+  auto NumThreadsJson =
+      JsonKernelInfo->getAsObject()->getInteger("ThreadLimitClause");
+  unsigned NumThreads =
+      (NumThreadsOpt > 0 ? NumThreadsOpt : NumThreadsJson.value());
+  // TODO: Print a warning if number of teams/threads is explicitly set in the
+  // kernel info but overridden through command line options.
+  auto LoopTripCount =
+      JsonKernelInfo->getAsObject()->getInteger("LoopTripCount");
+  auto KernelFunc = JsonKernelInfo->getAsObject()->getString("Name");
+
+  SmallVector<void *> TgtArgs;
+  SmallVector<ptrdiff_t> TgtArgOffsets;
+  auto NumArgs = JsonKernelInfo->getAsObject()->getInteger("NumArgs");
+  auto *TgtArgsArray = JsonKernelInfo->getAsObject()->getArray("ArgPtrs");
+  for (auto It : *TgtArgsArray)
+    TgtArgs.push_back(reinterpret_cast<void *>(It.getAsInteger().value()));
+  auto *TgtArgOffsetsArray =
+      JsonKernelInfo->getAsObject()->getArray("ArgOffsets");
+  for (auto It : *TgtArgOffsetsArray)
+    TgtArgOffsets.push_back(
+        static_cast<ptrdiff_t>(It.getAsInteger().value()));
+
+  __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0};
+  std::string KernelEntryName = KernelFunc.value().str();
+  KernelEntry.name = const_cast<char *>(KernelEntryName.c_str());
+  // Anything non-zero works to uniquely identify the kernel.
+  KernelEntry.addr = (void *)0x1;
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> ImageMB =
+      MemoryBuffer::getFile(KernelEntryName + ".image", /* isText */ false,
+                            /* RequiresNullTerminator */ false);
+  if (!ImageMB)
+    report_fatal_error("Error reading the kernel image.");
+
+  __tgt_device_image DeviceImage;
+  DeviceImage.ImageStart = (void *)ImageMB.get()->getBufferStart();
+  DeviceImage.ImageEnd = (void *)ImageMB.get()->getBufferEnd();
+  DeviceImage.EntriesBegin = &KernelEntry;
+  DeviceImage.EntriesEnd = &KernelEntry + 1;
+
+  __tgt_bin_desc Desc;
+  Desc.NumDeviceImages = 1;
+  Desc.HostEntriesBegin = &KernelEntry;
+  Desc.HostEntriesEnd = &KernelEntry + 1;
+  Desc.DeviceImages = &DeviceImage;
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> DeviceMemoryMB =
+      MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false,
+                            /* RequiresNullTerminator */ false);
+  if (!DeviceMemoryMB)
+    report_fatal_error("Error reading the kernel input device memory.");
+
+  setenv("LIBOMPTARGET_REPLAY", "1", 1);
+  if (VerifyOpt || SaveOutputOpt)
+    setenv("LIBOMPTARGET_RR_SAVE_OUTPUT", "1", 1);
+
+  auto DeviceMemorySizeJson =
+      JsonKernelInfo->getAsObject()->getInteger("DeviceMemorySize");
+  // Set device memory size to the ceiling of GB granularity.
+  uint64_t DeviceMemorySize =
+      std::ceil(DeviceMemorySizeJson.value() / (1024.0 * 1024.0 * 1024.0));
+
+  setenv("LIBOMPTARGET_RR_DEVMEM_SIZE",
+         std::to_string(DeviceMemorySize).c_str(), 1);
+
+  auto DeviceIdJson = JsonKernelInfo->getAsObject()->getInteger("DeviceId");
+  // TODO: Print warning if the user overrides the device id in the json file.
+  int32_t DeviceId = (DeviceIdOpt > -1 ? DeviceIdOpt : DeviceIdJson.value());
+
+  // TODO: do we need requires?
+  //__tgt_register_requires(/* Flags */1);
+
+  __tgt_init_all_rtls();
+
+  __tgt_register_lib(&Desc);
+
+  __tgt_target_kernel_replay(
+      /* Loc */ nullptr, DeviceId, KernelEntry.addr,
+      (void *)DeviceMemoryMB.get()->getBuffer().data(),
+      DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),
+      TgtArgOffsets.data(), NumArgs.value(), NumTeams, NumThreads,
+      LoopTripCount.value());
+
+  if (VerifyOpt) {
+    ErrorOr<std::unique_ptr<MemoryBuffer>> OriginalOutputMB =
+        MemoryBuffer::getFile(KernelEntryName + ".original.output",
+                              /* isText */ false,
+                              /* RequiresNullTerminator */ false);
+    if (!OriginalOutputMB)
+      report_fatal_error("Error reading the kernel original output file, set "
+                         "LIBOMPTARGET_RR_SAVE_OUTPUT when recording");
+    ErrorOr<std::unique_ptr<MemoryBuffer>> ReplayOutputMB =
+        MemoryBuffer::getFile(KernelEntryName + ".replay.output",
+                              /* isText */ false,
+                              /* RequiresNullTerminator */ false);
+    if (!ReplayOutputMB)
+      report_fatal_error("Error reading the kernel replay output file");
+
+    StringRef OriginalOutput = OriginalOutputMB.get()->getBuffer();
+    StringRef ReplayOutput = ReplayOutputMB.get()->getBuffer();
+    if (OriginalOutput == ReplayOutput)
+      outs() << "[llvm-omp-kernel-replay] Replay device memory verified!\n";
+    else
+      outs() << "[llvm-omp-kernel-replay] Replay device memory failed to "
+                "verify!\n";
+  }
+  // TODO: calling unregister lib causes plugin deinit error for nextgen
+  // plugins.
+  //__tgt_unregister_lib(&Desc);
+
+  return 0;
+}