diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -55,6 +55,7 @@ DIRECTORY) set(cuda_sources + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip ${devicertl_base_directory}/common/src/cancel.cu ${devicertl_base_directory}/common/src/critical.cu diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip @@ -0,0 +1,66 @@ +//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" + +// Partially derived from hcc_detail/device_functions.h + +/* + HW_ID Register bit structure + WAVE_ID 3:0 Wave buffer slot number. 0-9. + SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. + PIPE_ID 7:6 Pipeline from which the wave was dispatched. + CU_ID 11:8 Compute Unit the wave is assigned to. + SH_ID 12 Shader Array (within an SE) the wave is assigned to. + SE_ID 14:13 Shader Engine the wave is assigned to. + TG_ID 19:16 Thread-group ID + VM_ID 23:20 Virtual Memory ID + QUEUE_ID 26:24 Queue from which this wave was dispatched. + STATE_ID 29:27 State ID (graphics only, not compute). + ME_ID 31:30 Micro-engine ID. 
+ */
+
+#define HW_ID 4 // number of the hardware register read via s_getreg
+
+#define HW_ID_CU_ID_SIZE 4   // width of the CU_ID field, in bits
+#define HW_ID_CU_ID_OFFSET 8 // bit offset of the CU_ID field within HW_ID
+
+#define HW_ID_SE_ID_SIZE 2    // width of the SE_ID field, in bits
+#define HW_ID_SE_ID_OFFSET 13 // bit offset of the SE_ID field within HW_ID
+
+/*
+   Encoding of the 16 bit immediate parameter
+   ID      5:0     Register to read (HW_ID here)
+   OFFSET  10:6    Range: 0..31
+   WIDTH   15:11   Range: 1..32, stored as WIDTH - 1
+ */
+
+/*
+   The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width)
+   where hwreg forms a 16 bit immediate encoded by the assembler thus:
+   uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
+     return (Id << 0) | (Offset << 6) | ((Width - 1) << 11);
+   }
+   Macro arguments are parenthesized so expression arguments encode correctly.
+*/
+#define ENCODE_HWREG(WIDTH, OFF, REG)                                          \
+  ((REG) | ((OFF) << 6) | (((WIDTH) - 1) << 11))
+
+/*
+   Note: The results can be changed by a context switch
+   Return value in [0, 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper
+   bound on how many compute units are available. Some values in this
+   range may never be returned if there are fewer than 2^CU_ID_SIZE CUs.
+*/
+DEVICE uint32_t __kmpc_impl_smid() {
+  const uint32_t cu_id = __builtin_amdgcn_s_getreg(
+      ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
+  const uint32_t se_id = __builtin_amdgcn_s_getreg(
+      ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
+  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
+}