diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -274,9 +274,13 @@ // get_info fails iff HSA runtime not yet initialized hsa_status_t err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); - if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) - printf("rtl.cpp: err %d\n", err); - assert(err == HSA_STATUS_SUCCESS); + + if (err != HSA_STATUS_SUCCESS) { + if (print_kernel_trace > 0) + DP("rtl.cpp: err %s\n", get_error_string(err)); + + return err; + } CB(device_type, agent); return HSA_STATUS_SUCCESS; @@ -284,7 +288,7 @@ // iterate_agents fails iff HSA runtime not yet initialized if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) { - printf("rtl.cpp: err %d\n", err); + DP("rtl.cpp: err %s\n", get_error_string(err)); } return err; @@ -297,8 +301,8 @@ if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) { status_string = "unavailable"; } - fprintf(stderr, "[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, - __LINE__, source, status, status_string); + DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, source, + status, status_string); abort(); } } @@ -324,8 +328,8 @@ MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &AllocAllowed); if (err != HSA_STATUS_SUCCESS) { - fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n", - get_error_string(err)); + DP("Alloc allowed in memory pool check failed: %s\n", + get_error_string(err)); return err; } @@ -338,7 +342,7 @@ err = hsa_amd_memory_pool_get_info( MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); if (err != HSA_STATUS_SUCCESS) { - fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err)); + DP("Get memory pool info failed: %s\n", get_error_string(err)); return err; } @@ -346,7 +350,7 @@ err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size); if (err != HSA_STATUS_SUCCESS) { - fprintf(stderr, "Get memory pool size failed: %s\n", get_error_string(err)); + DP("Get memory pool size failed: %s\n", get_error_string(err)); return err; } @@ -366,8 +370,8 @@ MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &AllocAllowed); if (Err != HSA_STATUS_SUCCESS) { - fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n", - get_error_string(Err)); + DP("Alloc allowed in memory pool check failed: %s\n", + get_error_string(Err)); return {Err, false}; } @@ -392,8 +396,8 @@ }); if (Err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Iterate all memory pools", get_error_string(Err)); + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Iterate all memory pools", get_error_string(Err)); return Err; } } @@ -409,14 +413,14 @@ err = hsa_amd_agent_iterate_memory_pools( Agent, addKernArgPool, static_cast(&KernArgPools)); if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Iterate all memory pools", get_error_string(err)); + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Iterate all memory pools", get_error_string(err)); return {err, hsa_amd_memory_pool_t{}}; } } if (KernArgPools.empty()) { - fprintf(stderr, "Unable to find any valid kernarg pool\n"); + DP("Unable to find any valid kernarg pool\n"); return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}}; } @@ -638,16 +642,15 @@ Err = core::collectMemoryPools( CPUAgents, std::bind(&RTLDeviceInfoTy::addHostMemoryPool, this, _1, _2)); if (Err != HSA_STATUS_SUCCESS) { - fprintf(stderr, "HSA error in collecting memory pools for CPU: %s\n", - get_error_string(Err)); + DP("HSA error in collecting memory pools for CPU: %s\n", + get_error_string(Err)); return Err; } Err = core::collectMemoryPools( HSAAgents, std::bind(&RTLDeviceInfoTy::addDeviceMemoryPool, this, _1, _2)); if (Err != HSA_STATUS_SUCCESS) { - fprintf(stderr, - "HSA error in collecting memory pools for offload devices: %s\n", - get_error_string(Err)); + DP("HSA error in collecting memory pools for offload devices: %s\n", + get_error_string(Err)); return Err; } return HSA_STATUS_SUCCESS; @@ -813,8 +816,8 @@ Err = hsa_shut_down(); if (Err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA", - get_error_string(Err)); + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA", + get_error_string(Err)); } } }; @@ -999,9 +1002,9 @@ } if (print_kernel_trace & STARTUP_DETAILS) - fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id, - DeviceInfo.ComputeUnits[device_id], - DeviceInfo.GPUName[device_id].c_str()); + DP("Device#%-2d CU's: %2d %s\n", device_id, + DeviceInfo.ComputeUnits[device_id], + DeviceInfo.GPUName[device_id].c_str()); // Query attributes to determine number of threads/block and blocks/grid. uint16_t workgroup_max_dim[3]; @@ -1420,7 +1423,7 @@ hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4); if (rc != HSA_STATUS_SUCCESS) { - fprintf(stderr, "zero fill device_state failed with %u\n", rc); + DP("zero fill device_state failed with %u\n", rc); core::Runtime::Memfree(ptr); return HSA_STATUS_ERROR; } @@ -1489,11 +1492,17 @@ check("Module registering", err); if (err != HSA_STATUS_SUCCESS) { - fprintf(stderr, - "Possible gpu arch mismatch: device:%s, image:%s please check" - " compiler flag: -march=\n", - DeviceInfo.GPUName[device_id].c_str(), - get_elf_mach_gfx_name(elf_e_flags(image))); + const char *DeviceName = DeviceInfo.GPUName[device_id].c_str(); + const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image)); + + if (strcmp(DeviceName, ElfName) != 0) { + DP("Possible gpu arch mismatch: device:%s, image:%s please check" + " compiler flag: -march=\n", + DeviceName, ElfName); + } else { + DP("Error loading image onto GPU: %s\n", get_error_string(err)); + } + return NULL; } @@ -1585,8 +1594,8 @@ if (!e->addr) { // The host should have always something in the address to // uniquely identify the target region. - fprintf(stderr, "Analyzing host entry '' (size = %lld)...\n", - (unsigned long long)e->size); + DP("Analyzing host entry '' (size = %lld)...\n", + (unsigned long long)e->size); return NULL; } @@ -1886,18 +1895,15 @@ Max_Teams = RTLDeviceInfoTy::HardTeamLimit; if (print_kernel_trace & STARTUP_DETAILS) { - fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n", - RTLDeviceInfoTy::Max_Teams); - fprintf(stderr, "Max_Teams: %d\n", Max_Teams); - fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n", - RTLDeviceInfoTy::Warp_Size); - fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n", - RTLDeviceInfoTy::Max_WG_Size); - fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n", - RTLDeviceInfoTy::Default_WG_Size); - fprintf(stderr, "thread_limit: %d\n", thread_limit); - fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); - fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize); + DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams); + DP("Max_Teams: %d\n", Max_Teams); + DP("RTLDeviceInfoTy::Warp_Size: %d\n", RTLDeviceInfoTy::Warp_Size); + DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size); + DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", + RTLDeviceInfoTy::Default_WG_Size); + DP("thread_limit: %d\n", thread_limit); + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("ConstWGSize: %d\n", ConstWGSize); } // check for thread_limit() clause if (thread_limit > 0) { @@ -1919,7 +1925,7 @@ threadsPerGroup); } if (print_kernel_trace & STARTUP_DETAILS) - fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); + DP("threadsPerGroup: %d\n", threadsPerGroup); DP("Preparing %d threads\n", threadsPerGroup); // Set default num_groups (teams) @@ -1930,8 +1936,8 @@ DP("Set default num of groups %d\n", num_groups); if (print_kernel_trace & STARTUP_DETAILS) { - fprintf(stderr, "num_groups: %d\n", num_groups); - fprintf(stderr, "num_teams: %d\n", num_teams); + DP("num_groups: %d\n", num_groups); + DP("num_teams: %d\n", num_teams); } // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size @@ -1950,9 +1956,9 @@ num_groups = (num_teams < num_groups) ? num_teams : num_groups; } if (print_kernel_trace & STARTUP_DETAILS) { - fprintf(stderr, "num_groups: %d\n", num_groups); - fprintf(stderr, "Env.NumTeams %d\n", Env.NumTeams); - fprintf(stderr, "Env.TeamLimit %d\n", Env.TeamLimit); + DP("num_groups: %d\n", num_groups); + DP("Env.NumTeams %d\n", Env.NumTeams); + DP("Env.TeamLimit %d\n", Env.TeamLimit); } if (Env.NumTeams > 0) { @@ -1984,14 +1990,13 @@ if (num_groups > Max_Teams) { num_groups = Max_Teams; if (print_kernel_trace & STARTUP_DETAILS) - fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n", num_groups, - Max_Teams); + DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams); } if (num_groups > num_teams && num_teams > 0) { num_groups = num_teams; if (print_kernel_trace & STARTUP_DETAILS) - fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n", - num_groups, num_teams); + DP("Limiting num_groups %d to clause num_teams %d \n", num_groups, + num_teams); } } @@ -2003,9 +2008,9 @@ num_groups = Env.MaxTeamsDefault; } if (print_kernel_trace & STARTUP_DETAILS) { - fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); - fprintf(stderr, "num_groups: %d\n", num_groups); - fprintf(stderr, "loop_tripcount: %ld\n", loop_tripcount); + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("num_groups: %d\n", num_groups); + DP("loop_tripcount: %ld\n", loop_tripcount); } DP("Final %d num_groups and %d threadsPerGroup\n", num_groups, threadsPerGroup);