diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -307,7 +307,17 @@
   }
 };
 
-static RTLDeviceInfoTy DeviceInfo;
+static RTLDeviceInfoTy *DeviceInfo;
+
+__attribute__((constructor(101))) void init() {
+  DP("Init CUDA plugin!\n");
+  DeviceInfo = new RTLDeviceInfoTy();
+}
+
+__attribute__((destructor(101))) void deinit() {
+  DP("Deinit CUDA plugin!\n");
+  delete DeviceInfo;
+}
 
 #ifdef __cplusplus
 extern "C" {
@@ -317,11 +327,11 @@
   return elf_check_machine(image, 190); // EM_CUDA = 190.
 }
 
-int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+int32_t __tgt_rtl_number_of_devices() { return DeviceInfo->NumberOfDevices; }
 
 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
   DP("Init requires flags to %ld\n", RequiresFlags);
-  DeviceInfo.RequiresFlags = RequiresFlags;
+  DeviceInfo->RequiresFlags = RequiresFlags;
   return RequiresFlags;
 }
 
@@ -337,7 +347,7 @@
   }
 
   // Create the context and save it to use whenever this device is selected.
-  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+  err = cuCtxCreate(&DeviceInfo->Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                     cuDevice);
   if (err != CUDA_SUCCESS) {
     DP("Error when creating a CUDA context\n");
@@ -345,13 +355,13 @@
     return OFFLOAD_FAIL;
   }
 
-  err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  err = cuCtxSetCurrent(DeviceInfo->Contexts[device_id]);
   if (err != CUDA_SUCCESS) {
     DP("Error when setting current CUDA context\n");
     CUDA_ERR_STRING(err);
   }
 
-  for (CUstream &Stream : DeviceInfo.Streams[device_id]) {
+  for (CUstream &Stream : DeviceInfo->Streams[device_id]) {
     err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
     if (err != CUDA_SUCCESS) {
       DP("Error when creating CUDA stream\n");
@@ -365,12 +375,12 @@
                              cuDevice);
   if (err != CUDA_SUCCESS) {
     DP("Error getting max grid dimension, use default\n");
-    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+    DeviceInfo->BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
   } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
-    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
+    DeviceInfo->BlocksPerGrid[device_id] = maxGridDimX;
     DP("Using %d CUDA blocks per grid\n", maxGridDimX);
   } else {
-    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+    DeviceInfo->BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
     DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
        "at the hard limit\n",
        maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
@@ -382,12 +392,12 @@
                              cuDevice);
   if (err != CUDA_SUCCESS) {
     DP("Error getting max block dimension, use default\n");
-    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+    DeviceInfo->ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
   } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
-    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
+    DeviceInfo->ThreadsPerBlock[device_id] = maxBlockDimX;
     DP("Using %d CUDA threads per block\n", maxBlockDimX);
   } else {
-    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
+    DeviceInfo->ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
     DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping"
        "at the hard limit\n",
        maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
@@ -398,48 +408,48 @@
       cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                            cuDevice);
   if (err != CUDA_SUCCESS) {
     DP("Error getting warp size, assume default\n");
-    DeviceInfo.WarpSize[device_id] = 32;
+    DeviceInfo->WarpSize[device_id] = 32;
   } else {
-    DeviceInfo.WarpSize[device_id] = warpSize;
+    DeviceInfo->WarpSize[device_id] = warpSize;
   }
 
   // Adjust teams to the env variables
-  if (DeviceInfo.EnvTeamLimit > 0 &&
-      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
-    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
+  if (DeviceInfo->EnvTeamLimit > 0 &&
+      DeviceInfo->BlocksPerGrid[device_id] > DeviceInfo->EnvTeamLimit) {
+    DeviceInfo->BlocksPerGrid[device_id] = DeviceInfo->EnvTeamLimit;
     DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
-       DeviceInfo.EnvTeamLimit);
+       DeviceInfo->EnvTeamLimit);
   }
 
   DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
-     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
-     DeviceInfo.WarpSize[device_id]);
+     DeviceInfo->BlocksPerGrid[device_id],
+     DeviceInfo->ThreadsPerBlock[device_id], DeviceInfo->WarpSize[device_id]);
 
   // Set default number of teams
-  if (DeviceInfo.EnvNumTeams > 0) {
-    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
+  if (DeviceInfo->EnvNumTeams > 0) {
+    DeviceInfo->NumTeams[device_id] = DeviceInfo->EnvNumTeams;
     DP("Default number of teams set according to environment %d\n",
-       DeviceInfo.EnvNumTeams);
+       DeviceInfo->EnvNumTeams);
   } else {
-    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+    DeviceInfo->NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
     DP("Default number of teams set according to library's default %d\n",
        RTLDeviceInfoTy::DefaultNumTeams);
   }
-  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
-    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
+  if (DeviceInfo->NumTeams[device_id] > DeviceInfo->BlocksPerGrid[device_id]) {
+    DeviceInfo->NumTeams[device_id] = DeviceInfo->BlocksPerGrid[device_id];
     DP("Default number of teams exceeds device limit, capping at %d\n",
-       DeviceInfo.BlocksPerGrid[device_id]);
+       DeviceInfo->BlocksPerGrid[device_id]);
   }
 
   // Set default number of threads
-  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+  DeviceInfo->NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
   DP("Default number of threads set according to library's default %d\n",
      RTLDeviceInfoTy::DefaultNumThreads);
-  if (DeviceInfo.NumThreads[device_id] >
-      DeviceInfo.ThreadsPerBlock[device_id]) {
-    DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
+  if (DeviceInfo->NumThreads[device_id] >
+      DeviceInfo->ThreadsPerBlock[device_id]) {
+    DeviceInfo->NumTeams[device_id] = DeviceInfo->ThreadsPerBlock[device_id];
     DP("Default number of threads exceeds device limit, capping at %d\n",
-       DeviceInfo.ThreadsPerBlock[device_id]);
+       DeviceInfo->ThreadsPerBlock[device_id]);
   }
 
   return OFFLOAD_SUCCESS;
@@ -449,7 +459,7 @@
                                           __tgt_device_image *image) {
 
   // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  CUresult err = cuCtxSetCurrent(DeviceInfo->Contexts[device_id]);
   if (err != CUDA_SUCCESS) {
     DP("Error when setting a CUDA context for device %d\n", device_id);
     CUDA_ERR_STRING(err);
@@ -457,7 +467,7 @@
   }
 
   // Clear the offload table as we are going to create a new one.
-  DeviceInfo.clearOffloadEntriesTable(device_id);
+  DeviceInfo->clearOffloadEntriesTable(device_id);
 
   // Create the module and extract the function pointers.
@@ -471,7 +481,7 @@
   }
 
   DP("CUDA module successfully loaded!\n");
-  DeviceInfo.Modules.push_back(cumod);
+  DeviceInfo->Modules.push_back(cumod);
 
   // Find the symbols in the module by name.
   __tgt_offload_entry *HostBegin = image->EntriesBegin;
@@ -521,7 +531,7 @@
       //      (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
       //       (e->flags & OMP_DECLARE_TARGET_LINK ||
       //        e->flags == OMP_DECLARE_TARGET_TO))
-      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+      if (DeviceInfo->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
         // If unified memory is present any target link or to variables
         // can access host addresses directly. There is no longer a
         // need for device copies.
@@ -531,7 +541,7 @@
              DPxPTR(*((void**)e->addr)), DPxPTR(cuptr));
       }
 
-      DeviceInfo.addOffloadEntry(device_id, entry);
+      DeviceInfo->addOffloadEntry(device_id, entry);
 
       continue;
     }
@@ -589,7 +599,7 @@
     __tgt_offload_entry entry = *e;
     entry.addr = (void *)&KernelsList.back();
 
-    DeviceInfo.addOffloadEntry(device_id, entry);
+    DeviceInfo->addOffloadEntry(device_id, entry);
   }
 
   // send device environment data to the device
@@ -634,7 +644,7 @@
     }
   }
 
-  return DeviceInfo.getOffloadEntriesTable(device_id);
+  return DeviceInfo->getOffloadEntriesTable(device_id);
 }
 
 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
@@ -643,7 +653,7 @@
   }
 
   // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  CUresult err = cuCtxSetCurrent(DeviceInfo->Contexts[device_id]);
   if (err != CUDA_SUCCESS) {
     DP("Error while trying to set CUDA current context\n");
     CUDA_ERR_STRING(err);
@@ -665,14 +675,14 @@
 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                               int64_t size) {
   // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  CUresult err = cuCtxSetCurrent(DeviceInfo->Contexts[device_id]);
   if (err != CUDA_SUCCESS) {
     DP("Error when setting CUDA context\n");
     CUDA_ERR_STRING(err);
     return OFFLOAD_FAIL;
   }
 
-  CUstream &Stream = DeviceInfo.getNextStream(device_id);
+  CUstream &Stream = DeviceInfo->getNextStream(device_id);
 
   err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream);
   if (err != CUDA_SUCCESS) {
@@ -698,14 +708,14 @@
 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                 int64_t size) {
   // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  CUresult err = cuCtxSetCurrent(DeviceInfo->Contexts[device_id]);
   if (err != CUDA_SUCCESS) {
     DP("Error when setting CUDA context\n");
     CUDA_ERR_STRING(err);
     return OFFLOAD_FAIL;
   }
 
-  CUstream &Stream = DeviceInfo.getNextStream(device_id);
+  CUstream &Stream = DeviceInfo->getNextStream(device_id);
 
   err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream);
   if (err != CUDA_SUCCESS) {
@@ -730,7 +740,7 @@
 
 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
   // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  CUresult err = cuCtxSetCurrent(DeviceInfo->Contexts[device_id]);
   if (err != CUDA_SUCCESS) {
     DP("Error when setting CUDA context\n");
     CUDA_ERR_STRING(err);
@@ -750,7 +760,7 @@
     void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
     int32_t thread_limit, uint64_t loop_tripcount) {
   // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  CUresult err = cuCtxSetCurrent(DeviceInfo->Contexts[device_id]);
   if (err != CUDA_SUCCESS) {
     DP("Error when setting CUDA context\n");
     CUDA_ERR_STRING(err);
@@ -775,19 +785,19 @@
     DP("Setting CUDA threads per block to requested %d\n", thread_limit);
     // Add master warp if necessary
    if (KernelInfo->ExecutionMode == GENERIC) {
-      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
-      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+      cudaThreadsPerBlock += DeviceInfo->WarpSize[device_id];
+      DP("Adding master warp: +%d threads\n", DeviceInfo->WarpSize[device_id]);
     }
   } else {
-    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
+    cudaThreadsPerBlock = DeviceInfo->NumThreads[device_id];
     DP("Setting CUDA threads per block to default %d\n",
-       DeviceInfo.NumThreads[device_id]);
+       DeviceInfo->NumThreads[device_id]);
   }
 
-  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
-    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
+  if (cudaThreadsPerBlock > DeviceInfo->ThreadsPerBlock[device_id]) {
+    cudaThreadsPerBlock = DeviceInfo->ThreadsPerBlock[device_id];
     DP("Threads per block capped at device limit %d\n",
-       DeviceInfo.ThreadsPerBlock[device_id]);
+       DeviceInfo->ThreadsPerBlock[device_id]);
   }
 
   int kernel_limit;
@@ -802,7 +812,7 @@
 
   int cudaBlocksPerGrid;
   if (team_num <= 0) {
-    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
+    if (loop_tripcount > 0 && DeviceInfo->EnvNumTeams < 0) {
       if (KernelInfo->ExecutionMode == SPMD) {
         // We have a combined construct, i.e. `target teams distribute parallel
         // for [simd]`. We launch so many teams so that each thread will
@@ -828,13 +838,13 @@
          "threads per block %d\n",
          cudaBlocksPerGrid, loop_tripcount, cudaThreadsPerBlock);
     } else {
-      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
-      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
+      cudaBlocksPerGrid = DeviceInfo->NumTeams[device_id];
+      DP("Using default number of teams %d\n", DeviceInfo->NumTeams[device_id]);
     }
-  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
-    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
+  } else if (team_num > DeviceInfo->BlocksPerGrid[device_id]) {
+    cudaBlocksPerGrid = DeviceInfo->BlocksPerGrid[device_id];
     DP("Capping number of teams to team limit %d\n",
-       DeviceInfo.BlocksPerGrid[device_id]);
+       DeviceInfo->BlocksPerGrid[device_id]);
   } else {
     cudaBlocksPerGrid = team_num;
     DP("Using requested number of teams %d\n", team_num);
@@ -844,7 +854,7 @@
   DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
      cudaThreadsPerBlock);
 
-  CUstream &Stream = DeviceInfo.getNextStream(device_id);
+  CUstream &Stream = DeviceInfo->getNextStream(device_id);
 
   err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
                        cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
--- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -127,7 +127,17 @@
   }
 };
 
-static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
+static RTLDeviceInfoTy *DeviceInfo;
+
+__attribute__((constructor(101))) void init() {
+  DP("Init generic plugin!\n");
+  DeviceInfo = new RTLDeviceInfoTy(NUMBER_OF_DEVICES);
+}
+
+__attribute__((destructor(101))) void deinit() {
+  DP("Deinit generic plugin!\n");
+  delete DeviceInfo;
+}
 
 #ifdef __cplusplus
 extern "C" {
@@ -239,7 +249,7 @@
     return NULL;
   }
 
-  DeviceInfo.DynLibs.push_back(Lib);
+  DeviceInfo->DynLibs.push_back(Lib);
 
   struct link_map *libInfo = (struct link_map *)Lib.Handle;
 
@@ -264,11 +274,11 @@
   DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
      DPxPTR(entries_begin), DPxPTR(entries_end));
-  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+  DeviceInfo->createOffloadTable(device_id, entries_begin, entries_end);
 
   elf_end(e);
 
-  return DeviceInfo.getOffloadEntriesTable(device_id);
+  return DeviceInfo->getOffloadEntriesTable(device_id);
 }
 
 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
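For reference, a minimal standalone sketch of the lifetime pattern both plugins adopt in this patch: the per-plugin state moves from a static object to a heap allocation whose lifetime is bounded by shared-library constructor/destructor hooks. `PluginState` below is a hypothetical stand-in for `RTLDeviceInfoTy`; only the attribute usage mirrors the patch.

```cpp
#include <cstdio>

// Hypothetical stand-in for RTLDeviceInfoTy: state whose construction and
// destruction have side effects (the real type manages CUDA contexts,
// streams, and offload tables).
struct PluginState {
  PluginState() { std::puts("state constructed"); }
  ~PluginState() { std::puts("state destroyed"); }
};

// A raw pointer instead of a static object: the object's lifetime is now
// controlled by the two hooks below rather than by C++ static-initialization
// and static-destruction order, which is unspecified across translation
// units.
static PluginState *State;

// Priorities 0-100 are reserved for the implementation, so 101 is the
// earliest user-available priority: this constructor runs before
// default-priority (65535) constructors, and the matching destructor runs
// after default-priority destructors, giving State the outermost lifetime
// within this shared object.
__attribute__((constructor(101))) static void init() {
  std::puts("plugin loaded");
  State = new PluginState();
}

__attribute__((destructor(101))) static void deinit() {
  std::puts("plugin unloading");
  delete State;
}

int main() {
  // By the time main() runs (or, for a real plugin, by the time any
  // __tgt_rtl_* entry point is called), init() has already executed and
  // State is valid; deinit() runs after main returns.
  std::puts("using State");
  return 0;
}
```

The apparent intent, judging from the DP messages the patch adds, is to make initialization and teardown explicit: when libomptarget loads the plugin the constructor hook fires, and when it is unloaded the destructor hook fires, so `DeviceInfo` exists for every `__tgt_rtl_*` call and is destroyed exactly once, without depending on cross-library static-destruction order.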