diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h @@ -32,9 +32,6 @@ /// Get the id of the current thread on the GPU. llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override; - - /// Get the maximum number of threads in a block of the GPU. - llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; }; } // namespace CodeGen diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp @@ -46,16 +46,3 @@ CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x); return Bld.CreateCall(F, llvm::None, "nvptx_tid"); } - -llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - llvm::Module *M = &CGF.CGM.getModule(); - const char *LocSize = "__kmpc_amdgcn_gpu_num_threads"; - llvm::Function *F = M->getFunction(LocSize); - if (!F) { - F = llvm::Function::Create( - llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false), - llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); - } - return Bld.CreateCall(F, llvm::None, "nvptx_num_threads"); -} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -182,7 +182,7 @@ virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0; /// Get the maximum number of threads in a block of the GPU. - virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0; + llvm::Value *getGPUNumThreads(CodeGenFunction &CGF); /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 /// global_tid, int proc_bind) to generate code for 'proc_bind' clause. diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -3947,3 +3947,16 @@ } CGOpenMPRuntime::clear(); } + +llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Module *M = &CGF.CGM.getModule(); + const char *LocSize = "__kmpc_get_hardware_num_threads_in_block"; + llvm::Function *F = M->getFunction(LocSize); + if (!F) { + F = llvm::Function::Create( + llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false), + llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); + } + return Bld.CreateCall(F, llvm::None, "nvptx_num_threads"); +} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -32,9 +32,6 @@ /// Get the id of the current thread on the GPU. llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override; - - /// Get the maximum number of threads in a block of the GPU. - llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; }; } // CodeGen namespace. diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -46,11 +46,3 @@ &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x); return Bld.CreateCall(F, llvm::None, "nvptx_tid"); } - -llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - llvm::Function *F; - F = llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); - return Bld.CreateCall(F, llvm::None, "nvptx_num_threads"); -} diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -203,6 +203,9 @@ /// External interface to get the thread ID. uint32_t __kmpc_get_hardware_thread_id_in_block(); +/// External interface to get the number of threads. +uint32_t __kmpc_get_hardware_num_threads_in_block(); + /// Kernel /// ///{ diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -84,8 +84,9 @@ } uint32_t getNumberOfProcessorElements() { - // TODO - return mapping::getBlockSize(); + return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(), + __builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); } uint32_t getWarpId() { @@ -230,5 +231,9 @@ __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() { return mapping::getThreadIdInBlock(); } + +__attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() { + return mapping::getNumberOfProcessorElements(); +} } #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp --- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -22,6 +22,7 @@ /// Helper to keep code alive without introducing a performance penalty. __attribute__((used, weak, optnone)) void keepAlive() { __kmpc_get_hardware_thread_id_in_block(); + __kmpc_get_hardware_num_threads_in_block(); __kmpc_barrier_simple_spmd(nullptr, 0); } } // namespace _OMP