# Patch summary: registers src/amdgcn_smid.hip in the cuda_sources list of the
# AMDGCN device RTL CMakeLists.txt and adds that file, which defines
# __kmpc_impl_smid().
# NOTE(review): the per-mcpu add_custom_command declares OUTPUT ${bc_libname}
# while its COMMAND writes ${OUTPUTDIR}/${bc_libname}; these look like different
# paths when CMAKE_ARCHIVE_OUTPUT_DIRECTORY is set - confirm the rule is not
# permanently out of date in that configuration.
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt index 2917cd4204b3..aac7ddc28adc 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -1,151 +1,152 @@ ##===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # ##===----------------------------------------------------------------------===## # # Build the AMDGCN Device RTL if the ROCM tools are available # ##===----------------------------------------------------------------------===## find_package(LLVM QUIET CONFIG PATHS $ENV{AOMP} $ENV{HOME}/rocm/aomp /opt/rocm/aomp /usr/lib/rocm/aomp ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR} ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR} ${CMAKE_CXX_COMPILER_DIR} NO_DEFAULT_PATH) if (LLVM_DIR) libomptarget_say("Found LLVM ${LLVM_PACKAGE_VERSION}. Configure: ${LLVM_DIR}/LLVMConfig.cmake") else() libomptarget_say("Not building AMDGCN device RTL: AOMP not found") return() endif() set(AOMP_INSTALL_PREFIX ${LLVM_INSTALL_PREFIX}) if (AOMP_INSTALL_PREFIX) set(AOMP_BINDIR ${AOMP_INSTALL_PREFIX}/bin) else() set(AOMP_BINDIR ${LLVM_BUILD_BINARY_DIR}/bin) endif() libomptarget_say("Building AMDGCN device RTL. LLVM_COMPILER_PATH=${AOMP_BINDIR}") project(omptarget-amdgcn) add_custom_target(omptarget-amdgcn ALL) #optimization level set(optimization_level 2) # Activate RTL message dumps if requested by the user. 
if(LIBOMPTARGET_NVPTX_DEBUG) set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) endif() get_filename_component(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) set(cuda_sources + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip ${devicertl_base_directory}/common/src/cancel.cu ${devicertl_base_directory}/common/src/critical.cu ${devicertl_base_directory}/common/src/data_sharing.cu ${devicertl_base_directory}/common/src/libcall.cu ${devicertl_base_directory}/common/src/loop.cu ${devicertl_base_directory}/common/src/omp_data.cu ${devicertl_base_directory}/common/src/omptarget.cu ${devicertl_base_directory}/common/src/parallel.cu ${devicertl_base_directory}/common/src/reduction.cu ${devicertl_base_directory}/common/src/support.cu ${devicertl_base_directory}/common/src/sync.cu ${devicertl_base_directory}/common/src/task.cu) set(h_files ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_atomics.h ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h ${devicertl_base_directory}/common/debug.h ${devicertl_base_directory}/common/device_environment.h ${devicertl_base_directory}/common/omptarget.h ${devicertl_base_directory}/common/omptargeti.h ${devicertl_base_directory}/common/state-queue.h ${devicertl_base_directory}/common/target_atomic.h ${devicertl_base_directory}/common/state-queuei.h ${devicertl_base_directory}/common/support.h) # for both in-tree and out-of-tree build if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY) set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR}) else() set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}) endif() # create libraries set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900) if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) endif() macro(add_cuda_bc_library) set(cu_cmd ${AOMP_BINDIR}/clang++ -std=c++11 -fcuda-rdc -fvisibility=default --cuda-device-only -Wno-unused-value -x hip -O${optimization_level} --cuda-gpu-arch=${mcpu} ${CUDA_DEBUG} 
-I${CMAKE_CURRENT_SOURCE_DIR}/src -I${devicertl_base_directory}) set(bc1_files) foreach(file ${ARGN}) get_filename_component(fname ${file} NAME_WE) set(bc1_filename ${fname}.${mcpu}.bc) add_custom_command( OUTPUT ${bc1_filename} COMMAND ${cu_cmd} ${file} -o ${bc1_filename} DEPENDS ${file} ${h_files}) list(APPEND bc1_files ${bc1_filename}) endforeach() add_custom_command( OUTPUT linkout.cuda.${mcpu}.bc COMMAND ${AOMP_BINDIR}/llvm-link ${bc1_files} -o linkout.cuda.${mcpu}.bc DEPENDS ${bc1_files}) list(APPEND bc_files linkout.cuda.${mcpu}.bc) endmacro() set(libname "omptarget-amdgcn") foreach(mcpu ${mcpus}) set(bc_files) add_cuda_bc_library(${cuda_sources}) set(bc_libname lib${libname}-${mcpu}.bc) add_custom_command( OUTPUT ${bc_libname} COMMAND ${AOMP_BINDIR}/llvm-link ${bc_files} | ${AOMP_BINDIR}/opt --always-inline -o ${OUTPUTDIR}/${bc_libname} DEPENDS ${bc_files}) add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname}) install(FILES ${OUTPUTDIR}/${bc_libname} DESTINATION "${OPENMP_INSTALL_LIBDIR}/libdevice" ) endforeach() diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip new file mode 100644 index 000000000000..74d0d167137f --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip @@ -0,0 +1,61 @@ +//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" + +// Partially derived from hcc_detail/device_functions.h + +// HW_ID Register bit structure +// WAVE_ID 3:0 Wave buffer slot number. 0-9. +// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. +// PIPE_ID 7:6 Pipeline from which the wave was dispatched. 
+// CU_ID       11:8    Compute Unit the wave is assigned to.
+// SH_ID       12      Shader Array (within an SE) the wave is assigned to.
+// SE_ID       14:13   Shader Engine the wave is assigned to.
+// TG_ID       19:16   Thread-group ID
+// VM_ID       23:20   Virtual Memory ID
+// QUEUE_ID    26:24   Queue from which this wave was dispatched.
+// STATE_ID    29:27   State ID (graphics only, not compute).
+// ME_ID       31:30   Micro-engine ID.
+
+enum {
+  HW_ID = 4, // specify that the hardware register to read is HW_ID
+
+  HW_ID_CU_ID_SIZE = 4,   // size of CU_ID field in bits
+  HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register
+
+  HW_ID_SE_ID_SIZE = 2,    // size of SE_ID field in bits
+  HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register
+};
+
+// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit
+// immediate and returns a 32 bit value.
+// The encoding of the immediate parameter is:
+// ID           5:0     Which register to read from
+// OFFSET       10:6    Range: 0..31
+// WIDTH        15:11   Range: 1..32 (stored as WIDTH - 1, see below)
+
+// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width)
+// where hwreg forms a 16 bit immediate encoded by the assembler thus:
+// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
+//   return (Id << 0) | (Offset << 6) | ((Width - 1) << 11);
+// }
+#define ENCODE_HWREG(WIDTH, OFF, REG)                                          \
+  ((REG) | ((OFF) << 6) | (((WIDTH) - 1) << 11)) // args parenthesized for safety
+
+// Note: The results can be changed by a context switch
+// Return value in [0, 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper
+// bound on how many compute units are available. Some values in this
+// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs.
+DEVICE uint32_t __kmpc_impl_smid() {
+  uint32_t cu_id = __builtin_amdgcn_s_getreg(
+      ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
+  uint32_t se_id = __builtin_amdgcn_s_getreg(
+      ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
+  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
+}