diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -13,6 +13,8 @@
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
+#include <cstdlib>
+#include <fstream>
 #include <cuda.h>
 #include <list>
 #include <memory>
@@ -77,6 +79,35 @@
   NONE
 };
 
+struct FunctionPropertiesInfo {
+  /// Number of basic blocks.
+  int64_t BasicBlockCount = 0;
+
+  /// Number of blocks reached from a conditional instruction, or that are
+  /// 'cases' of a SwitchInstr.
+  int64_t BlocksReachedFromConditionalInstruction = 0;
+
+  /// Number of uses of this function, plus 1 if the function is callable
+  /// outside the module.
+  int64_t Uses = 0;
+
+  /// Number of direct calls made from this function to other functions
+  /// defined in this module.
+  int64_t DirectCallsToDefinedFunctions = 0;
+
+  /// Number of load instructions.
+  int64_t LoadInstCount = 0;
+
+  /// Number of store instructions.
+  int64_t StoreInstCount = 0;
+
+  /// Maximum loop depth in the function.
+  int64_t MaxLoopDepth = 0;
+
+  /// Number of top-level loops in the function.
+  int64_t TopLevelLoopCount = 0;
+};
+
 /// Use a single entity to encode a kernel and a set of flags.
 struct KernelTy {
   CUfunction Func;
@@ -87,11 +118,14 @@
   // 2 - SPMD mode execution with Generic mode semantics.
   int8_t ExecutionMode;
 
+  /// Numeric features of the kernel as extracted by FunctionPropertiesAnalysis.
+  FunctionPropertiesInfo KernelFeatures;
+
   /// Maximal number of threads per block for this kernel.
   int MaxThreadsPerBlock = 0;
 
-  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
-      : Func(_Func), ExecutionMode(_ExecutionMode) {}
+  KernelTy(CUfunction _Func, int8_t _ExecutionMode, FunctionPropertiesInfo _KernelFeatures)
+      : Func(_Func), ExecutionMode(_ExecutionMode), KernelFeatures(_KernelFeatures) {}
 };
 
 /// Device environment data
@@ -810,7 +844,30 @@
         CUDA_ERR_STRING(Err);
       }
 
-      KernelsList.emplace_back(Func, ExecModeVal);
+      std::string KernelFeaturesNameStr(E->name);
+      KernelFeaturesNameStr += "_KernelFeatures";
+      const char *KernelFeaturesName = KernelFeaturesNameStr.c_str();
+      CUdeviceptr KernelFeaturesPtr;
+      size_t KernelFeaturesSize;
+      Err = cuModuleGetGlobal(&KernelFeaturesPtr, &KernelFeaturesSize, Module, KernelFeaturesName);
+      FunctionPropertiesInfo KernelFeatures;
+      if (Err == CUDA_SUCCESS) {
+        assert(sizeof(KernelFeatures) == KernelFeaturesSize);
+        Err = cuMemcpyDtoH(&KernelFeatures, KernelFeaturesPtr, KernelFeaturesSize);
+        if (Err == CUDA_SUCCESS) {
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Kernel features for `%s`:\n", E->name);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Basic block count = %ld\n", KernelFeatures.BasicBlockCount);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Blocks reached from conditional = %ld\n", KernelFeatures.BlocksReachedFromConditionalInstruction);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Direct calls to defined functions = %ld\n", KernelFeatures.DirectCallsToDefinedFunctions);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Load instruction count = %ld\n", KernelFeatures.LoadInstCount);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Max loop depth = %ld\n", KernelFeatures.MaxLoopDepth);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Store instruction count = %ld\n", KernelFeatures.StoreInstCount);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Top-level loop count = %ld\n", KernelFeatures.TopLevelLoopCount);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Uses = %ld\n", KernelFeatures.Uses);
+        }
+      }
+
+      KernelsList.emplace_back(Func, ExecModeVal, KernelFeatures);
 
       __tgt_offload_entry Entry = *E;
       Entry.addr = &KernelsList.back();
@@ -1086,22 +1143,132 @@
       CudaBlocksPerGrid = TeamNum;
     }
 
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Launching kernel %s with %d blocks and %d threads in %s "
-         "mode\n",
-         (getOffloadEntry(DeviceId, TgtEntryPtr))
-             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
-             : "(null)",
-         CudaBlocksPerGrid, CudaThreadsPerBlock,
-         (KernelInfo->ExecutionMode != SPMD
-              ? (KernelInfo->ExecutionMode == GENERIC ? "Generic" : "SPMD-Generic")
-              : "SPMD"));
-
     CUstream Stream = getStream(DeviceId, AsyncInfo);
-    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
-                         /* gridDimZ */ 1, CudaThreadsPerBlock,
-                         /* blockDimY */ 1, /* blockDimZ */ 1,
-                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+    // The environment variable BENCHMARK_KERNEL enables the benchmarking functionality.
+    // If BENCHMARK_KERNEL is set, the kernel is executed with a range of launch configurations and the
+    // results are appended to the file named by the environment variable BENCHMARK_KERNEL_LOG.
+    // Otherwise the kernel is executed normally.
+    if (std::getenv("BENCHMARK_KERNEL")) {
+      const char *KernelName = getOffloadEntry(DeviceId, TgtEntryPtr) ? getOffloadEntry(DeviceId, TgtEntryPtr)->name : "(null)";
+      // The range for "number of blocks per grid" is evaluated as follows:
+      // - If the parameter TeamNum is set to a positive value, the range degenerates to TeamNum.
+      // - Otherwise the range is all powers of 2 from 1 up to the maximum supported number of blocks
+      //   per grid. E.g., if the device supports up to 65536 blocks per grid, the benchmarking code
+      //   runs the kernel with 1, 2, 4, ..., 32768 and 65536 blocks per grid.
+      int MinBlocksPerGrid, MaxBlocksPerGrid;
+      if (TeamNum <= 0) {
+        MinBlocksPerGrid = 1;
+        MaxBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Running kernel `%s` using %d to %d CUDA blocks\n",
+             KernelName, MinBlocksPerGrid, MaxBlocksPerGrid);
+      }
+      else {
+        MinBlocksPerGrid = MaxBlocksPerGrid = CudaBlocksPerGrid;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Running kernel `%s` using the requested number of CUDA blocks (%d)\n",
+             KernelName, CudaBlocksPerGrid);
+      }
+      // The range for "number of threads per block" is defined similarly, based on the ThreadLimit parameter.
+      int MinThreadsPerBlock, MaxThreadsPerBlock;
+      if (ThreadLimit <= 0) {
+        MinThreadsPerBlock = 1;
+        MaxThreadsPerBlock = DeviceData[DeviceId].NumThreads;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Running kernel `%s` using %d to %d threads per block\n",
+             KernelName, MinThreadsPerBlock, MaxThreadsPerBlock);
+      }
+      else {
+        MinThreadsPerBlock = MaxThreadsPerBlock = CudaThreadsPerBlock;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Running kernel `%s` using the requested number of threads (%d)\n",
+             KernelName, CudaThreadsPerBlock);
+      }
+      // The log file is a CSV file with the following columns:
+      // - KernelName,
+      // - one column for each of the kernel features,
+      // - one column for each of the runtime parameters (currently only BlocksPerGrid and ThreadsPerBlock),
+      // - ExecutionMs.
+      // Each row of the file represents a single run of the kernel, identified by the kernel name and by
+      // the runtime parameters used for that run.
+      char *LogFileName = std::getenv("BENCHMARK_KERNEL_LOG");
+      std::ofstream LogFileOutputStream;
+      std::ifstream LogFileInputStream;
+      if (LogFileName) {
+        LogFileOutputStream.open(LogFileName, std::ios_base::app);
+        LogFileInputStream.open(LogFileName);
+        if (LogFileInputStream.peek() == std::fstream::traits_type::eof()) {
+          LogFileOutputStream << "KernelName,BasicBlocks,BlocksReachedFromConditionals,DirectCallsToDefinedFunctions,"
+                              << "LoadInsts,MaxLoopDepth,StoreInsts,TopLevelLoops,Uses,"
+                              << "BlocksPerGrid,ThreadsPerBlock,ExecutionMs\n";
+          LogFileOutputStream.flush();
+        }
+        LogFileInputStream.close();
+      }
+      for (int TrialThreadsPerBlock = MaxThreadsPerBlock; TrialThreadsPerBlock >= MinThreadsPerBlock; TrialThreadsPerBlock /= 2) {
+        for (int TrialBlocksPerGrid = MaxBlocksPerGrid; TrialBlocksPerGrid >= MinBlocksPerGrid; TrialBlocksPerGrid /= 2) {
+          // The runtime of each trial is measured with CUDA events; see also
+          // https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/
+          CUevent StartEvent, StopEvent;
+          cuEventCreate(&StartEvent, CU_EVENT_DEFAULT);
+          cuEventCreate(&StopEvent, CU_EVENT_DEFAULT);
+          cuEventRecord(StartEvent, Stream);
+          CUresult TrialErr = cuLaunchKernel(KernelInfo->Func, TrialBlocksPerGrid, /* gridDimY */ 1,
+                                             /* gridDimZ */ 1, TrialThreadsPerBlock,
+                                             /* blockDimY */ 1, /* blockDimZ */ 1,
+                                             /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+          if (Err == CUDA_SUCCESS)
+            Err = TrialErr;
+          cuEventRecord(StopEvent, Stream);
+          cuEventSynchronize(StopEvent);
+          float ExecutionMs = 0;
+          cuEventElapsedTime(&ExecutionMs, StartEvent, StopEvent);
+          cuEventDestroy(StartEvent);
+          cuEventDestroy(StopEvent);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+               "Kernel %s executed in %.3f ms with %d blocks and %d threads in %s "
+               "mode\n",
+               KernelName,
+               ExecutionMs,
+               TrialBlocksPerGrid, TrialThreadsPerBlock,
+               (KernelInfo->ExecutionMode != SPMD
"Generic" : "SPMD-Generic") + : "SPMD")); + if (LogFileOutputStream.is_open()) { + LogFileOutputStream << KernelName << "," + << KernelInfo->KernelFeatures.BasicBlockCount << "," + << KernelInfo->KernelFeatures.BlocksReachedFromConditionalInstruction << "," + << KernelInfo->KernelFeatures.DirectCallsToDefinedFunctions << "," + << KernelInfo->KernelFeatures.LoadInstCount << "," + << KernelInfo->KernelFeatures.MaxLoopDepth << "," + << KernelInfo->KernelFeatures.StoreInstCount << "," + << KernelInfo->KernelFeatures.TopLevelLoopCount << "," + << KernelInfo->KernelFeatures.Uses << "," + << TrialBlocksPerGrid << "," << TrialThreadsPerBlock << "," << ExecutionMs << "\n"; + LogFileOutputStream.flush(); + } + } + } + if (LogFileOutputStream.is_open()) { + LogFileOutputStream.close(); + } + } + else { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Launching kernel %s with %d blocks and %d threads in %s " + "mode\n", + (getOffloadEntry(DeviceId, TgtEntryPtr)) + ? getOffloadEntry(DeviceId, TgtEntryPtr)->name + : "(null)", + CudaBlocksPerGrid, CudaThreadsPerBlock, + (KernelInfo->ExecutionMode != SPMD + ? (KernelInfo->ExecutionMode == GENERIC ? "Generic" : "SPMD-Generic") + : "SPMD")); + Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, + /* gridDimZ */ 1, CudaThreadsPerBlock, + /* blockDimY */ 1, /* blockDimZ */ 1, + /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); + } if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) return OFFLOAD_FAIL;