diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h --- a/openmp/libomptarget/include/device.h +++ b/openmp/libomptarget/include/device.h @@ -468,6 +468,9 @@ /// Destroy the event. int32_t destroyEvent(void *Event); + + /// Update recorded globals to their values described in \p Globals. + int32_t initializeRecordedGlobals(const void *Globals, int32_t Size); /// } private: diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -422,7 +422,8 @@ void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripCount); + int32_t ThreadLimit, uint64_t LoopTripCount, + const void *Globals, int64_t GlobalsSize); void __tgt_set_info_flag(uint32_t); diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h --- a/openmp/libomptarget/include/rtl.h +++ b/openmp/libomptarget/include/rtl.h @@ -74,10 +74,12 @@ typedef int32_t(destroy_event_ty)(int32_t, void *); typedef int32_t(release_async_info_ty)(int32_t, __tgt_async_info *); typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **); - typedef int64_t(init_device_into_ty)(int64_t, __tgt_device_info *, + typedef int64_t(init_device_into_ty)(int32_t, __tgt_device_info *, const char **); typedef int32_t(data_lock_ty)(int32_t, void *, int64_t, void **); typedef int32_t(data_unlock_ty)(int32_t, void *); + typedef int32_t(initialize_recorded_globals_ty)(int32_t, const void *, + int32_t); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -131,6 +133,7 @@ release_async_info_ty *release_async_info = nullptr; data_lock_ty *data_lock = nullptr; data_unlock_ty *data_unlock = nullptr; + initialize_recorded_globals_ty *initialize_recorded_globals = nullptr; // Are there images 
associated with this RTL. bool IsUsed = false; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -105,6 +105,11 @@ private: __tgt_target_table TTTablePtr; llvm::SmallVector<__tgt_offload_entry> Entries; + + public: + using const_iterator = decltype(Entries)::const_iterator; + const_iterator begin() const { return Entries.begin(); } + const_iterator end() const { return Entries.end(); } }; /// Image identifier within the corresponding device. Notice that this id is @@ -154,6 +159,9 @@ /// Get a reference to the offload entry table for the image. OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; } + const OffloadEntryTableTy &getOffloadEntryTable() const { + return OffloadEntryTable; + } }; /// Class implementing common functionalities of offload kernels. Each plugin @@ -186,6 +194,12 @@ /// Get the kernel name. const char *getName() const { return Name; } + /// Get the image that contains this kernel. + const DeviceImageTy &getImage() const { + assert(ImagePtr && "Kernel is not initialized!"); + return *ImagePtr; + } + /// Indicate whether an execution mode is valid. static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) { switch (ExecutionMode) { @@ -245,6 +259,9 @@ /// The execution flags of the kernel. OMPTgtExecModeFlags ExecutionMode; + /// The image that contains this kernel. + DeviceImageTy *ImagePtr = nullptr; + protected: /// The dynamic memory size reserved for executing the kernel. uint32_t DynamicMemorySize; @@ -398,6 +415,9 @@ return std::move(MB); } + /// Initialize recorded globals encoded by \p GlobalEncoding. + Error initializeRecordedGlobals(const void *GlobalEncoding, int32_t Size); + private: /// Register offload entry for global variable. 
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -12,10 +12,12 @@ #include "Debug.h" #include "GlobalHandler.h" #include "JIT.h" +#include "Utilities.h" #include "elf_common.h" #include "omptarget.h" #include "omptargetplugin.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Support/Error.h" #include "llvm/Support/JSON.h" @@ -34,11 +36,15 @@ // TODO: Fix any thread safety issues for multi-threaded kernel recording. struct RecordReplayTy { private: + static constexpr int ALIGN = 16; + // Memory pointers for recording, replaying memory. - void *MemoryStart; - void *MemoryPtr; - size_t MemorySize; - GenericDeviceTy *Device; + void *MemoryStart = nullptr; + void *AlignedMemoryStart = nullptr; + void *MemoryPtr = nullptr; + size_t MemorySize = 0; + GenericDeviceTy *Device = nullptr; + std::mutex AllocationLock; // Environment variables for record and replay. @@ -59,37 +65,106 @@ Error preallocateDeviceMemory() { // Pre-allocate memory on device. Starts with 64GB and subtracts in steps // of 1GB until allocation succeeds. 
- const size_t MAX_MEMORY_ALLOCATION = + const size_t MaxMemoryAllocation = OMPX_DeviceMemorySize * 1024 * 1024 * 1024ULL; constexpr size_t STEP = 1024 * 1024 * 1024ULL; MemoryStart = nullptr; - for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) { + size_t Size = MaxMemoryAllocation; + for (; Size > 0; Size -= STEP) { MemoryStart = - Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT); + Device->allocate(Size, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT); if (MemoryStart) break; } - if (!MemoryStart) + if (!MemoryStart && MaxMemoryAllocation) return Plugin::error("Allocating record/replay memory"); - MemoryPtr = MemoryStart; + // Align the memory at two times the step size to avoid mismatch in the + // beginning of the memory region. + AlignedMemoryStart = alignPtr(MemoryStart, (2 * STEP)); + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), + "Allocated %" PRIu64 + " bytes at %p for record and replay, aligned it to %p\n", + Size, MemoryStart, AlignedMemoryStart); + + MemoryPtr = AlignedMemoryStart; MemorySize = 0; return Plugin::success(); } - void dumpDeviceMemory(StringRef Filename, - AsyncInfoWrapperTy &AsyncInfoWrapper) { + void dumpGlobals(StringRef Filename, const DeviceImageTy &Image) { + int32_t Size = 0; + + for (auto &OffloadEntry : Image.getOffloadEntryTable()) { + if (!OffloadEntry.size) + continue; + Size += std::strlen(OffloadEntry.name) + /* '\0' */ 1 + + /* OffloadEntry.size value */ sizeof(uint32_t) + + OffloadEntry.size; + } + + ErrorOr> GlobalsMB = + WritableMemoryBuffer::getNewUninitMemBuffer(Size); + if (!GlobalsMB) + report_fatal_error("Error creating MemoryBuffer for globals memory"); + + void *BufferPtr = GlobalsMB.get()->getBufferStart(); + for (auto &OffloadEntry : Image.getOffloadEntryTable()) { + if (!OffloadEntry.size) + continue; + + int32_t NameLength = std::strlen(OffloadEntry.name) + 1; + memcpy(BufferPtr, OffloadEntry.name, NameLength); + BufferPtr = advanceVoidPtr(BufferPtr, NameLength); + + 
*((uint32_t *)(BufferPtr)) = OffloadEntry.size; + BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); + + auto Err = Plugin::success(); + { + AsyncInfoWrapperTy AsyncInfoWrapper(Err, *Device, nullptr); + if (auto Err = + Device->dataRetrieve(BufferPtr, OffloadEntry.addr, + OffloadEntry.size, AsyncInfoWrapper)) + report_fatal_error("Error retrieving data for global"); + } + if (Err) + report_fatal_error("Error retrieving data for global"); + BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.size); + } + assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && + "Buffer over/under-filled."); + assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) && + "Buffer size mismatch"); + + StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size); + std::error_code EC; + raw_fd_ostream OS(Filename, EC); + OS << GlobalsMemory; + OS.close(); + } + + void dumpDeviceMemory(StringRef Filename) { ErrorOr> DeviceMemoryMB = WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize); if (!DeviceMemoryMB) report_fatal_error("Error creating MemoryBuffer for device memory"); - auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(), - MemoryStart, MemorySize, AsyncInfoWrapper); - if (Err) - report_fatal_error("Error retrieving data for target pointer"); + if (MemorySize) { + auto Err = Plugin::success(); + { + AsyncInfoWrapperTy AsyncInfoWrapper(Err, *Device, nullptr); + if (auto Err = Device->dataRetrieve( + DeviceMemoryMB.get()->getBufferStart(), AlignedMemoryStart, + MemorySize, AsyncInfoWrapper)) + report_fatal_error("Error retrieving data for target pointer"); + } + if (Err) + report_fatal_error("Error retrieving data for target pointer"); + } StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize); std::error_code EC; @@ -134,11 +209,10 @@ OS.close(); } - void saveKernelInputInfo(const char *Name, void **ArgPtrs, - ptrdiff_t *ArgOffsets, int32_t NumArgs, - uint64_t NumTeamsClause, uint32_t ThreadLimitClause, - uint64_t 
LoopTripCount, - AsyncInfoWrapperTy &AsyncInfoWrapper) { + void saveKernelInputInfo(const char *Name, const DeviceImageTy &Image, + void **ArgPtrs, ptrdiff_t *ArgOffsets, + int32_t NumArgs, uint64_t NumTeamsClause, + uint32_t ThreadLimitClause, uint64_t LoopTripCount) { json::Object JsonKernelInfo; JsonKernelInfo["Name"] = Name; JsonKernelInfo["NumArgs"] = NumArgs; @@ -160,7 +234,10 @@ Twine KernelName(Name); Twine MemoryFilename = KernelName + ".memory"; - dumpDeviceMemory(MemoryFilename.str(), AsyncInfoWrapper); + dumpDeviceMemory(MemoryFilename.str()); + + Twine GlobalUpdatesFilename = KernelName + ".globals"; + dumpGlobals(GlobalUpdatesFilename.str(), Image); Twine JsonFilename = KernelName + ".json"; std::error_code EC; @@ -172,22 +249,20 @@ JsonOS.close(); } - void saveKernelOutputInfo(const char *Name, - AsyncInfoWrapperTy &AsyncInfoWrapper) { + void saveKernelOutputInfo(const char *Name) { Twine OutputFilename = Twine(Name) + (isRecording() ? ".original.output" : ".replay.output"); - dumpDeviceMemory(OutputFilename.str(), AsyncInfoWrapper); + dumpDeviceMemory(OutputFilename.str()); } void *alloc(uint64_t Size) { - assert(MemoryStart && "Expected memory has been pre-allocated"); + assert(Size && MemoryStart && "Expected memory has been pre-allocated"); void *Alloc = nullptr; - constexpr int ALIGN = 16; // Assumes alignment is a power of 2. 
int64_t AlignedSize = Size + (ALIGN - 1) & (~(ALIGN - 1)); std::lock_guard LG(AllocationLock); Alloc = MemoryPtr; - MemoryPtr = (char *)MemoryPtr + AlignedSize; + MemoryPtr = advanceVoidPtr(MemoryPtr, AlignedSize); MemorySize += AlignedSize; return Alloc; } @@ -197,6 +272,44 @@ return preallocateDeviceMemory(); } + Error initializeRecordedGlobals(DeviceImageTy &Image, const void *BufferPtr, + int32_t Size) { + StringMap OffloadEntryMap; + for (auto &OffloadEntry : Image.getOffloadEntryTable()) + OffloadEntryMap[OffloadEntry.name] = &OffloadEntry; + + GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler(); + + const void *BufferEndPtr = advanceVoidPtr(BufferPtr, Size); + while (BufferPtr != BufferEndPtr) { + StringRef Name((const char *)BufferPtr); + BufferPtr = advanceVoidPtr(BufferPtr, Name.size() + 1); + + uint32_t RecordedSize = *((const uint32_t *)(BufferPtr)); + BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); + + GlobalTy ImageGlobal(Name.str(), 0); + if (auto Err = + GHandler.getGlobalMetadataFromImage(*Device, Image, ImageGlobal)) + return Err; + + // Verify the global has the expected size to avoid memory corruption. + if (RecordedSize != ImageGlobal.getSize()) + report_fatal_error( + "Recorded global " + Name + + " has unexpected size: " + std::to_string(ImageGlobal.getSize()) + + " vs " + std::to_string(RecordedSize)); + + // Set the host pointer of the global such that we read the value from the + // recording. 
+ ImageGlobal.setPtr(const_cast(BufferPtr)); + if (auto Err = GHandler.writeGlobalToDevice(*Device, Image, ImageGlobal)) + return Err; + BufferPtr = advanceVoidPtr(BufferPtr, ImageGlobal.getSize()); + } + return Plugin::success(); + } + void deinit() { Device->free(MemoryStart); } } RecordReplay; @@ -211,6 +324,7 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) { + ImagePtr = &Image; PreferredNumThreads = getDefaultNumThreads(GenericDevice); MaxNumThreads = GenericDevice.getThreadLimit(); @@ -706,25 +820,49 @@ void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, int32_t NumArgs, uint64_t NumTeamsClause, uint32_t ThreadLimitClause, uint64_t LoopTripCount, __tgt_async_info *AsyncInfo) { - auto Err = Plugin::success(); - AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo); + + bool RecordKernelOutput = RecordReplay.isRecordingOrReplaying() && + RecordReplay.isSaveOutputEnabled(); GenericKernelTy &GenericKernel = *reinterpret_cast(EntryPtr); - if (RecordReplay.isRecording()) + // Saving kernel input information in recording mode is synchronous as is the + // initialization of the device memory. + // TODO: If we delay the write of the files we could do this asynchronously: + // - schedule the memory H2D transfers, + // - schedule the kernel info D2H transfers, + // - schedule the kernel, + // - schedule the memory D2H transfers (iff the output is saved), + // - synchronize, + // - write all the files. + if (RecordReplay.isRecording()) { + // Ensure we are done transferring memory from the device before we issue D2H + // copies for the kernel info. 
+ if (AsyncInfo && AsyncInfo->Queue) + if (auto Err = synchronize(AsyncInfo)) + return Err; RecordReplay.saveKernelInputInfo( - GenericKernel.getName(), ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause, - ThreadLimitClause, LoopTripCount, AsyncInfoWrapper); + GenericKernel.getName(), GenericKernel.getImage(), ArgPtrs, ArgOffsets, + NumArgs, NumTeamsClause, ThreadLimitClause, LoopTripCount); + } - Err = - GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause, - ThreadLimitClause, LoopTripCount, AsyncInfoWrapper); + // Block to synchronize the launch iff the kernel output is recorded. + auto Err = Plugin::success(); + { + AsyncInfoWrapperTy AsyncInfoWrapper( + Err, *this, RecordKernelOutput ? nullptr : AsyncInfo); - if (RecordReplay.isRecordingOrReplaying() && - RecordReplay.isSaveOutputEnabled()) - RecordReplay.saveKernelOutputInfo(GenericKernel.getName(), - AsyncInfoWrapper); + Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, + NumTeamsClause, ThreadLimitClause, LoopTripCount, + AsyncInfoWrapper); + } + + // If we are recording the output the kernel was already synchronized and is + // known to be done by now. If not, the AsyncInfoWrapper will not synchronize + // if AsyncInfo is a proper object. 
+ if (RecordKernelOutput) + RecordReplay.saveKernelOutputInfo(GenericKernel.getName()); return Err; } @@ -746,6 +884,14 @@ return initDeviceInfoImpl(DeviceInfo); } +Error GenericDeviceTy::initializeRecordedGlobals(const void *GlobalEncoding, + int32_t Size) { + assert(GlobalEncoding && Size && "Expected encoded globals"); + assert(LoadedImages.size() == 1); + return RecordReplay.initializeRecordedGlobals(*LoadedImages.back(), + GlobalEncoding, Size); +} + Error GenericDeviceTy::printInfo() { // TODO: Print generic information here return printInfoImpl(); @@ -1235,6 +1381,20 @@ return OFFLOAD_SUCCESS; } +/// Restore recorded globals on device \p DeviceId; OFFLOAD_FAIL on error. +int32_t __tgt_rtl_initialize_recorded_globals(int32_t DeviceId, + const void *GlobalsEncoding, + int32_t Size) { + auto Err = Plugin::get().getDevice(DeviceId).initializeRecordedGlobals( + GlobalsEncoding, Size); + if (!Err) + return OFFLOAD_SUCCESS; + // toString() consumes Err, so the status is decided before reporting. + REPORT("Failure to initialize recorded globals on device %d: %s\n", + DeviceId, toString(std::move(Err)).data()); + return OFFLOAD_FAIL; +} + #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -683,6 +683,12 @@ return OFFLOAD_SUCCESS; } +int32_t DeviceTy::initializeRecordedGlobals(const void *Globals, int32_t Size) { + assert(RTL->initialize_recorded_globals && + "Plugin does not support initialization of recorded globals."); + return RTL->initialize_recorded_globals(RTLDeviceID, Globals, Size); +} + /// Check whether a device has an associated RTL and initialize it if it's not /// already initialized. 
bool deviceIsReady(int DeviceNum) { diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -287,7 +287,9 @@ int64_t DeviceMemorySize, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount) { + uint64_t LoopTripCount, + const void *UpdatedGlobals, + int64_t UpdatedGlobalsSize) { if (checkDeviceAndCtors(DeviceId, Loc)) { DP("Not offloading to device %" PRId64 "\n", DeviceId); @@ -298,7 +300,8 @@ AsyncInfoTy AsyncInfo(Device); int Rc = target_replay(Loc, Device, HostPtr, DeviceMemory, DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit, - LoopTripCount, AsyncInfo); + LoopTripCount, UpdatedGlobals, UpdatedGlobalsSize, + AsyncInfo); if (Rc == OFFLOAD_SUCCESS) Rc = AsyncInfo.synchronize(); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -1722,6 +1722,7 @@ void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripCount, + const void *GlobalsEncoding, int32_t GlobalsEncodingSize, AsyncInfoTy &AsyncInfo) { int32_t DeviceId = Device.DeviceID; TableMap *TM = getTableMap(HostPtr); @@ -1749,9 +1750,18 @@ DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index); - void *TgtPtr = Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr, - TARGET_ALLOC_DEFAULT); - Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo); + // Restore the recorded globals first. Replaying with stale globals could + // silently produce wrong results, so a failure aborts the replay. + if (GlobalsEncodingSize && + Device.initializeRecordedGlobals(GlobalsEncoding, GlobalsEncodingSize) != + OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + if (DeviceMemorySize) { + void *TgtPtr = 
Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr, + TARGET_ALLOC_DEFAULT); + Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo); + } int Ret = Device.runTeamRegion(TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams, diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -49,7 +49,8 @@ void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo); + uint64_t LoopTripCount, const void *UpdatedGlobals, + int32_t UpdatedGlobalsSize, AsyncInfoTy &AsyncInfo); extern void handleTargetOutcome(bool Success, ident_t *Loc); extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc); diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -250,6 +250,8 @@ DynLibrary->getAddressOfSymbol("__tgt_rtl_data_lock"); *((void **)&RTL.data_unlock) = DynLibrary->getAddressOfSymbol("__tgt_rtl_data_unlock"); + *((void **)&RTL.initialize_recorded_globals) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_initialize_recorded_globals"); RTL.LibraryHandler = std::move(DynLibrary); diff --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp --- a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp +++ b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp @@ -111,12 +111,18 @@ Desc.HostEntriesEnd = &KernelEntry + 1; Desc.DeviceImages = &DeviceImage; + // Read in the entire file (hence the "as stream"), since we might lock the + // memory. 
ErrorOr> DeviceMemoryMB = - MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false, - /* RequiresNullTerminator */ false); + MemoryBuffer::getFileAsStream(KernelEntryName + ".memory"); if (!DeviceMemoryMB) report_fatal_error("Error reading the kernel input device memory."); + ErrorOr> GlobalsMB = + MemoryBuffer::getFileAsStream(KernelEntryName + ".globals"); + if (!GlobalsMB) + report_fatal_error("Error reading the globals state file."); + setenv("LIBOMPTARGET_REPLAY", "1", 1); if (VerifyOpt || SaveOutputOpt) setenv("LIBOMPTARGET_RR_SAVE_OUTPUT", "1", 1); @@ -127,8 +133,12 @@ uint64_t DeviceMemorySize = std::ceil(DeviceMemorySizeJson.value() / (1024.0 * 1024.0 * 1024.0)); - setenv("LIBOMPTARGET_RR_DEVMEM_SIZE", - std::to_string(DeviceMemorySize).c_str(), 1); + // TODO: Do not explicitly set device memory size to avoid messing up the + // aligned memory start workaround. In the general case, we will need + // to implement custom memory mapping to avoid misaligned memory + // addresses between recording and replaying a kernel. + // setenv("LIBOMPTARGET_RR_DEVMEM_SIZE", + // std::to_string(DeviceMemorySize).c_str(), 1); auto DeviceIdJson = JsonKernelInfo->getAsObject()->getInteger("DeviceId"); // TODO: Print warning if the user overrides the device id in the json file. @@ -146,7 +156,8 @@ (void *)DeviceMemoryMB.get()->getBuffer().data(), DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(), TgtArgOffsets.data(), NumArgs.value(), NumTeams, NumThreads, - LoopTripCount.value()); + LoopTripCount.value(), GlobalsMB.get()->getBufferStart(), + GlobalsMB.get()->getBufferSize()); if (VerifyOpt) { ErrorOr> OriginalOutputMB =