diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -346,6 +346,11 @@ __tgt_kernel_arguments *Args, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, void *NoAliasDepList); +int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr, + void *DeviceMemory, int64_t DeviceMemorySize, + void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, uint64_t LoopTripCount); void __tgt_set_info_flag(uint32_t); diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt @@ -20,6 +20,7 @@ add_definitions(-DDEBUG_PREFIX="PluginInterface") set_property(TARGET PluginInterface PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET PluginInterface PROPERTY CXX_VISIBILITY_PRESET protected) llvm_update_compile_flags(PluginInterface) set(LINK_LLVM_LIBS LLVMSupport) if (LLVM_LINK_LLVM_DYLIB) diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -369,6 +369,9 @@ } uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } + bool isRecording() const { return RecordKernel; } + bool isReplaying() const { return ReplayKernel; } + private: /// Register offload entry for global variable. Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, @@ -409,6 +412,15 @@ UInt64Envar OMPX_TargetStackSize; UInt64Envar OMPX_TargetHeapSize; + // Environment variables for record and replay. + BoolEnvar RecordKernel; + BoolEnvar ReplayKernel; + + // Memory pointers for recording, replaying memory. + void *MemoryStart; + void *MemoryPtr; + size_t MemorySize; + /// Pointer to the memory manager or nullptr if not available. MemoryManagerTy *MemoryManager; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -15,6 +15,10 @@ #include "omptarget.h" #include "omptargetplugin.h" +#include "llvm/Support/Base64.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/MemoryBuffer.h" + #include #include @@ -43,6 +47,15 @@ DynamicMemorySize = GenericDevice.getDynamicMemorySize(); + if (GenericDevice.isRecording()) { + std::error_code EC; + Twine ImageName = Twine(Name) + Twine(".image"); + raw_fd_ostream OS(ImageName.str(), EC); + // TODO: check for errors. + OS << Image.getMemoryBuffer().getBuffer(); + OS.close(); + } + return initImpl(GenericDevice, Image); } @@ -140,8 +153,9 @@ // Do not initialize the following two envars since they depend on the // device initialization. These cannot be consulted until the device is // initialized correctly. We intialize them in GenericDeviceTy::init(). - OMPX_TargetStackSize(), OMPX_TargetHeapSize(), MemoryManager(nullptr), - DeviceId(DeviceId), GridValues(OMPGridValues), + OMPX_TargetStackSize(), OMPX_TargetHeapSize(), + RecordKernel("LIBOMPTARGET_RECORD"), ReplayKernel("LIBOMPTARGET_REPLAY"), + MemoryManager(nullptr), DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock() { if (OMP_NumTeams > 0) GridValues.GV_Max_Teams = @@ -180,6 +194,24 @@ if (EnableMM) MemoryManager = new MemoryManagerTy(*this, ThresholdMM); + if (isRecording()|| isReplaying()) { + // Pre-allocate memory on device. + constexpr size_t MAX_MEMORY_ALLOCATION = 8 * 1024 * 1024 * 1024ULL; + constexpr size_t STEP = 1024 * 1024 * 1024ULL; + MemoryStart = nullptr; + for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) { + MemoryStart = allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT); + if (MemoryStart) + break; + } + + if (!MemoryStart) + return Plugin::error("Allocating recording memory"); + + MemoryPtr = MemoryStart; + MemorySize = 0; + } + return Plugin::success(); } @@ -349,6 +381,16 @@ TargetAllocTy Kind) { void *Alloc = nullptr; + if (isRecording() || isReplaying()) { + constexpr int ALIGN = 16; + // Assumes alignment is a power of 2. + int64_t AlignedSize = Size + (ALIGN - 1) & (~(ALIGN - 1)); + Alloc = MemoryPtr; + MemoryPtr = (char *)MemoryPtr + AlignedSize; + MemorySize += AlignedSize; + return Alloc; + } + switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: @@ -377,6 +419,10 @@ } Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { + // When recording free is a noop. + if (isRecording() || isReplaying()) + return Plugin::success(); + int Res; if (MemoryManager) Res = MemoryManager->free(TgtPtr); @@ -424,9 +470,78 @@ GenericKernelTy &GenericKernel = *reinterpret_cast(EntryPtr); + if (isRecording()) { + json::Object JsonKernelInfo; + JsonKernelInfo["Name"] = GenericKernel.getName(); + JsonKernelInfo["NumArgs"] = NumArgs; + JsonKernelInfo["NumTeamsClause"] = NumTeamsClause; + JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause; + JsonKernelInfo["LoopTripCount"] = LoopTripCount; + + json::Array JsonArgPtrs; + for (int I = 0; I < NumArgs; ++I) + JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]); + JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs)); + + json::Array JsonArgOffsets; + for (int I = 0; I < NumArgs; ++I) + JsonArgOffsets.push_back(ArgOffsets[I]); + JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets)); + + json::Array JsonMemoryInfo; + ErrorOr> DeviceMemoryMB = + WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize); + if (!DeviceMemoryMB) + report_fatal_error("Create MB for allocated data pre kernel execution"); + + auto Err = dataRetrieve(DeviceMemoryMB.get()->getBufferStart(), MemoryStart, + MemorySize, AsyncInfoWrapper); + if (Err) + report_fatal_error("Error retrieving data for target pointer"); + + StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize); + + std::error_code EC; + Twine MemoryFilename = Twine(GenericKernel.getName()) + Twine(".memory"); + raw_fd_ostream MemoryOS(MemoryFilename.str(), EC); + MemoryOS << DeviceMemory; + MemoryOS.close(); + + Twine JsonFilename = Twine(GenericKernel.getName()) + Twine(".json"); + raw_fd_ostream JsonOS(JsonFilename.str(), EC); + JsonOS << json::Value(std::move(JsonKernelInfo)); + JsonOS.close(); + } + Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause, ThreadLimitClause, LoopTripCount, AsyncInfoWrapper); + + if (isRecording() || isReplaying()) { + ErrorOr> DeviceMemoryMB = + WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize); + if (!DeviceMemoryMB) + report_fatal_error( + "Create MB for retrieving allocated data post kernel execution"); + + auto Err = dataRetrieve(DeviceMemoryMB.get()->getBufferStart(), MemoryStart, + MemorySize, AsyncInfoWrapper); + if (Err) + report_fatal_error("Error retrieving data for target pointer"); + + StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize); + + std::error_code EC; + std::string OutputFilename(GenericKernel.getName()); + if (isRecording()) + OutputFilename += ".golden"; + else /* isReplaying */ + OutputFilename += ".replay"; + raw_fd_ostream OS(OutputFilename, EC); + OS << DeviceMemory; + OS.close(); + } + return Err; } diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -26,6 +26,7 @@ __tgt_target_teams_nowait_mapper; __tgt_target_kernel; __tgt_target_kernel_nowait; + __tgt_target_kernel_replay; __tgt_mapper_num_components; __tgt_push_mapper_component; __kmpc_push_target_tripcount; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -255,6 +255,30 @@ return OMP_TGT_SUCCESS; } +EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, + void *HostPtr, void *DeviceMemory, + int64_t DeviceMemorySize, void **TgtArgs, + ptrdiff_t *TgtOffsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount) { + + if (checkDeviceAndCtors(DeviceId, Loc)) { + DP("Not offloading to device %" PRId64 "\n", DeviceId); + return OMP_TGT_FAIL; + } + DeviceTy &Device = *PM->Devices[DeviceId]; + + AsyncInfoTy AsyncInfo(Device); + int Rc = target_replay(Loc, Device, HostPtr, DeviceMemory, DeviceMemorySize, + TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit, + LoopTripCount, AsyncInfo); + if (Rc == OFFLOAD_SUCCESS) + Rc = AsyncInfo.synchronize(); + handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); + assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel_replay unexpected failure!"); + return OMP_TGT_SUCCESS; +} + EXTERN int __tgt_target_kernel_nowait( ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit, void *HostPtr, __tgt_kernel_arguments *Args, int32_t DepNum, void *DepList, diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -1578,3 +1578,50 @@ return OFFLOAD_SUCCESS; } + +int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, + void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs, + ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, uint64_t LoopTripCount, + AsyncInfoTy &AsyncInfo) { + int32_t DeviceId = Device.DeviceID; + TableMap *TM = getTableMap(HostPtr); + // No map for this host pointer found! + if (!TM) { + REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n", + DPxPTR(HostPtr)); + return OFFLOAD_FAIL; + } + + // get target table. + __tgt_target_table *TargetTable = nullptr; + { + std::lock_guard TrlTblLock(PM->TrlTblMtx); + assert(TM->Table->TargetsTable.size() > (size_t)DeviceId && + "Not expecting a device ID outside the table's bounds!"); + TargetTable = TM->Table->TargetsTable[DeviceId]; + } + assert(TargetTable && "Global data has not been mapped\n"); + + // Launch device execution. + void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; + DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", + TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index); + + void *TgtPtr = Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr, + TARGET_ALLOC_DEFAULT); + Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo); + + int Ret; + { + Ret = Device.runTeamRegion(TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, + NumTeams, ThreadLimit, LoopTripCount, AsyncInfo); + } + + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Executing target region abort target.\n"); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -45,6 +45,12 @@ uint64_t Tripcount, int IsTeamConstruct, AsyncInfoTy &AsyncInfo); +extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, + void *DeviceMemory, int64_t DeviceMemorySize, + void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo); + extern void handleTargetOutcome(bool Success, ident_t *Loc); extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc); extern void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, diff --git a/openmp/libomptarget/tools/CMakeLists.txt b/openmp/libomptarget/tools/CMakeLists.txt --- a/openmp/libomptarget/tools/CMakeLists.txt +++ b/openmp/libomptarget/tools/CMakeLists.txt @@ -25,3 +25,4 @@ endmacro() add_subdirectory(deviceinfo) +add_subdirectory(kernelreplay) diff --git a/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt b/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt @@ -0,0 +1,26 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build llvm-omp-kernel-replay tool +# +##===----------------------------------------------------------------------===## + +libomptarget_say("Building the llvm-omp-kernel-replay tool") + +add_openmp_tool(llvm-omp-kernel-replay llvm-omp-kernel-replay.cpp) + +llvm_update_compile_flags(llvm-omp-kernel-replay) + +target_include_directories(llvm-omp-kernel-replay PRIVATE + ${LIBOMPTARGET_INCLUDE_DIR} +) +target_link_libraries(llvm-omp-kernel-replay PRIVATE + LLVMSupport + omp + omptarget +) diff --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp @@ -0,0 +1,143 @@ +//===- llvm-omp-kernel-replay.cpp - Replay OpenMP offload kernel ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is a command line utility to replay the execution of recorded OpenMP +// offload kernels. +// +//===----------------------------------------------------------------------===// + +#include "omptarget.h" +#include "omptargetplugin.h" +#include "rtl.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Base64.h" +#include "llvm/Support/CommandLine.h" +#include + +using namespace llvm; + +cl::OptionCategory ReplayOptions("llvm-omp-kernel-replay Options"); + +// InputFilename - The filename to read the json description of the kernel. +static cl::opt InputFilename(cl::Positional, + cl::desc(""), + cl::Required); + +static cl::opt + Verify("verify", + cl::desc("Verify device memory post execution against golden."), + cl::init(false), cl::cat(ReplayOptions)); + +static cl::opt NumTeamsOpt("num-teams", + cl::desc("Set the number of teams."), + cl::init(0), cl::cat(ReplayOptions)); + +static cl::opt NumThreadsOpt("num-threads", + cl::desc("Set the number of threads."), + cl::init(0), cl::cat(ReplayOptions)); + +int main(int argc, char **argv) { + cl::HideUnrelatedOptions(ReplayOptions); + cl::ParseCommandLineOptions(argc, argv, "llvm-omp-kernel-replay\n"); + + ErrorOr> KernelInfoMB = + MemoryBuffer::getFile(InputFilename, /* isText */ true, + /* RequiresNullTerminator */ true); + if (!KernelInfoMB) + report_fatal_error("Error create MB for kernel info"); + Expected JsonKernelInfo = + json::parse(KernelInfoMB.get()->getBuffer()); + if (auto Err = JsonKernelInfo.takeError()) + report_fatal_error("Cannot load kernel info json file"); + + auto NumTeamsJson = + JsonKernelInfo->getAsObject()->getInteger("NumTeamsClause"); + unsigned NumTeams = (NumTeamsOpt > 0 ? NumTeamsOpt : NumTeamsJson.value()); + auto NumThreadsJson = + JsonKernelInfo->getAsObject()->getInteger("ThreadLimitClause"); + unsigned NumThreads = + (NumThreadsOpt > 0 ? NumThreadsOpt : NumThreadsJson.value()); + auto LoopTripCount = + JsonKernelInfo->getAsObject()->getInteger("LoopTripCount"); + auto KernelFunc = JsonKernelInfo->getAsObject()->getString("Name"); + + SmallVector TgtArgs; + SmallVector TgtArgOffsets; + auto NumArgs = JsonKernelInfo->getAsObject()->getInteger("NumArgs"); + auto *TgtArgsArray = JsonKernelInfo->getAsObject()->getArray("ArgPtrs"); + for (auto It : *TgtArgsArray) + TgtArgs.push_back(reinterpret_cast(It.getAsInteger().value())); + auto *TgtArgOffsetsArray = + JsonKernelInfo->getAsObject()->getArray("ArgOffsets"); + for (auto It : *TgtArgOffsetsArray) + TgtArgOffsets.push_back( + reinterpret_cast(It.getAsInteger().value())); + + __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0}; + std::string KernelEntryName = KernelFunc.value().str(); + KernelEntry.name = const_cast(KernelEntryName.c_str()); + // Anything non-zero works to uniquely identify the kernel. + KernelEntry.addr = (void *)0x1; + + ErrorOr> ImageMB = + MemoryBuffer::getFile(KernelEntryName + ".image", /* isText */ false, + /* RequiresNullTerminator */ false); + + __tgt_device_image DeviceImage; + DeviceImage.ImageStart = (void *)ImageMB.get()->getBufferStart(); + DeviceImage.ImageEnd = (void *)ImageMB.get()->getBufferEnd(); + DeviceImage.EntriesBegin = &KernelEntry; + DeviceImage.EntriesEnd = &KernelEntry + 1; + + __tgt_bin_desc Desc; + Desc.NumDeviceImages = 1; + Desc.HostEntriesBegin = &KernelEntry; + Desc.HostEntriesEnd = &KernelEntry + 1; + Desc.DeviceImages = &DeviceImage; + + ErrorOr> DeviceMemoryMB = + MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false, + /* RequiresNullTerminator */ false); + + setenv("LIBOMPTARGET_REPLAY", "1", 1); + + // TODO: do we need requires? + //__tgt_register_requires(/* Flags */1); + + __tgt_init_all_rtls(); + + __tgt_register_lib(&Desc); + + __tgt_target_kernel_replay( + /* Loc */ nullptr, /* DeviceId */ 0, KernelEntry.addr, + (void *)DeviceMemoryMB.get()->getBuffer().data(), + DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(), + TgtArgOffsets.data(), NumArgs.value(), NumTeams, NumThreads, + LoopTripCount.value()); + + if (Verify) { + ErrorOr> GoldenMB = + MemoryBuffer::getFile(KernelEntryName + ".golden", /* isText */ false, + /* RequiresNullTerminator */ false); + ErrorOr> ReplayMB = + MemoryBuffer::getFile(KernelEntryName + ".replay", /* isText */ false, + /* RequiresNullTerminator */ false); + StringRef Golden = GoldenMB.get()->getBuffer(); + StringRef Replay = ReplayMB.get()->getBuffer(); + if (Golden == Replay) + outs() << "[llvm-omp-kernel-replay] Replay device memory verified!\n"; + else + outs() << "[llvm-omp-kernel-replay] Replay device memory failed to verify!\n"; + } + // TODO: calling unregister lib causes plugin deinit error for nextgen plugins. + //__tgt_unregister_lib(&Desc); + + return 0; +}