diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h copy from clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h copy to clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h @@ -1,4 +1,4 @@ -//===----- CGOpenMPRuntimeNVPTX.h - Interface to OpenMP NVPTX Runtimes ----===// +//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// // -// This provides a class for OpenMP runtime code generation specialized to NVPTX -// targets from generalized CGOpenMPRuntimeGPU class. +// This provides a class for OpenMP runtime code generation specialized to +// AMDGCN targets from generalized CGOpenMPRuntimeGPU class. // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H -#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H +#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H +#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H #include "CGOpenMPRuntime.h" #include "CGOpenMPRuntimeGPU.h" @@ -22,14 +22,16 @@ namespace clang { namespace CodeGen { -class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU { +class CGOpenMPRuntimeAMDGCN : public CGOpenMPRuntimeGPU { public: - explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM); + explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM); llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override; + llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override; + llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; }; -} // CodeGen namespace. -} // clang namespace. +} // namespace CodeGen +} // namespace clang -#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H +#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp @@ -0,0 +1,64 @@ +//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides a class for OpenMP runtime code generation specialized to +// AMDGCN targets from generalized CGOpenMPRuntimeGPU class. +// +//===----------------------------------------------------------------------===// + +#include "CGOpenMPRuntimeAMDGCN.h" +#include "CGOpenMPRuntimeGPU.h" +#include "CodeGenFunction.h" +#include "clang/AST/Attr.h" +#include "clang/AST/DeclOpenMP.h" +#include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtVisitor.h" +#include "clang/Basic/Cuda.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +using namespace clang; +using namespace CodeGen; +using namespace llvm::omp; + +CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM) + : CGOpenMPRuntimeGPU(CGM) { + if (!CGM.getLangOpts().OpenMPIsDevice) + llvm_unreachable("OpenMP AMDGCN can only handle device code."); +} + +/// Get the GPU warp size. +llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + // return constant compile-time target-specific warp size + unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + return Bld.getInt32(WarpSize); +} + +/// Get the id of the current thread on the GPU. +llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Function *F = + CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x); + return Bld.CreateCall(F, llvm::None, "nvptx_tid"); +} + +/// Get the maximum number of threads in a block of the GPU. +llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Module *M = &CGF.CGM.getModule(); + const char *LocSize = "__ockl_get_local_size"; + llvm::Function *F = M->getFunction(LocSize); + if (!F) { + F = llvm::Function::Create( + llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false), + llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); + } + return Bld.CreateTrunc( + Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty); +} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This provides a generalized class for OpenMP runtime code generation -// specialized by GPU target NVPTX. +// specialized by GPU targets NVPTX and AMDGCN. // //===----------------------------------------------------------------------===// @@ -199,8 +199,11 @@ void clear() override; /// Declare generalized virtual functions which need to be defined - /// by all specializations of OpenMPGPURuntime Targets. + /// by all specializations of OpenMPGPURuntime Targets like AMDGCN + /// and NVPTX. virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0; + virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0; + virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0; /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 /// global_tid, int proc_bind) to generate code for 'proc_bind' clause. diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This provides a generalized class for OpenMP runtime code generation -// specialized by GPU target NVPTX. +// specialized by GPU targets NVPTX and AMDGCN. // //===----------------------------------------------------------------------===// @@ -621,14 +621,6 @@ }; } // anonymous namespace -/// Get the id of the current thread on the GPU. -static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) { - return CGF.EmitRuntimeCall( - llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), - "nvptx_tid"); -} - /// Get the id of the warp in the block. /// We assume that the warp size is 32, which is always the case /// on the NVPTX device, to generate more efficient code. @@ -636,7 +628,8 @@ CGBuilderTy &Bld = CGF.Builder; unsigned LaneIDBits = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2); - return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id"); + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); + return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); } /// Get the id of the current lane in the Warp. @@ -646,18 +639,11 @@ CGBuilderTy &Bld = CGF.Builder; unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue( llvm::omp::GV_Warp_Size_Log2_Mask); - return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask), + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); + return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), "nvptx_lane_id"); } -/// Get the maximum number of threads in a block of the GPU. -static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { - return CGF.EmitRuntimeCall( - llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), - "nvptx_num_threads"); -} - /// Get the value of the thread_limit clause in the teams directive. /// For the 'generic' execution mode, the runtime encodes thread_limit in /// the launch parameters, always starting thread_limit+warpSize threads per @@ -668,9 +654,9 @@ CGBuilderTy &Bld = CGF.Builder; auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); return IsInSPMDExecutionMode - ? getNVPTXNumThreads(CGF) - : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF), - "thread_limit"); + ? RT.getGPUNumThreads(CGF) + : Bld.CreateNUWSub(RT.getGPUNumThreads(CGF), + RT.getGPUWarpSize(CGF), "thread_limit"); } /// Get the thread id of the OMP master thread. @@ -682,8 +668,8 @@ /// If NumThreads is 1024, master id is 992. static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; - llvm::Value *NumThreads = getNVPTXNumThreads(CGF); auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); + llvm::Value *NumThreads = RT.getGPUNumThreads(CGF); // We assume that the warp size is a power of 2. llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1)); @@ -1235,8 +1221,9 @@ llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); EST.ExitBB = CGF.createBasicBlock(".exit"); + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); llvm::Value *IsWorker = - Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); + Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF)); Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); CGF.EmitBlock(WorkerBB); @@ -1245,7 +1232,7 @@ CGF.EmitBlock(MasterCheckBB); llvm::Value *IsMaster = - Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); + Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF)); Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); CGF.EmitBlock(MasterBB); @@ -2780,14 +2767,16 @@ llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit"); + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); + // Get the mask of active threads in the warp. llvm::Value *Mask = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask)); // Fetch team-local id of the thread. - llvm::Value *ThreadID = getNVPTXThreadID(CGF); + llvm::Value *ThreadID = RT.getGPUThreadID(CGF); // Get the width of the team. - llvm::Value *TeamWidth = getNVPTXNumThreads(CGF); + llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF); // Initialize the counter variable for the loop. QualType Int32Ty = @@ -3250,8 +3239,9 @@ CGM.addCompilerUsedGlobal(TransferMedium); } + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); // Get the CUDA thread id of the current OpenMP thread on the GPU. - llvm::Value *ThreadID = getNVPTXThreadID(CGF); + llvm::Value *ThreadID = RT.getGPUThreadID(CGF); // nvptx_lane_id = nvptx_id % warpsize llvm::Value *LaneID = getNVPTXLaneID(CGF); // nvptx_warp_id = nvptx_id / warpsize @@ -4844,9 +4834,11 @@ CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const { + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { ScheduleKind = OMPC_DIST_SCHEDULE_static; - Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF), + Chunk = CGF.EmitScalarConversion( + RT.getGPUNumThreads(CGF), CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), S.getIterationVariable()->getType(), S.getBeginLoc()); return; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -27,6 +27,8 @@ public: explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM); llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override; + llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override; + llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; }; } // CodeGen namespace. diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -39,3 +39,21 @@ &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), "nvptx_warp_size"); } + +/// Get the id of the current thread on the GPU. +llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Function *F; + F = llvm::Intrinsic::getDeclaration( + &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x); + return Bld.CreateCall(F, llvm::None, "nvptx_tid"); +} + +/// Get the maximum number of threads in a block of the GPU. +llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Function *F; + F = llvm::Intrinsic::getDeclaration( + &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); + return Bld.CreateCall(F, llvm::None, "nvptx_num_threads"); +} diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -62,6 +62,7 @@ CGObjCRuntime.cpp CGOpenCLRuntime.cpp CGOpenMPRuntime.cpp + CGOpenMPRuntimeAMDGCN.cpp CGOpenMPRuntimeGPU.cpp CGOpenMPRuntimeNVPTX.cpp CGRecordLayoutBuilder.cpp diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -19,6 +19,7 @@ #include "CGObjCRuntime.h" #include "CGOpenCLRuntime.h" #include "CGOpenMPRuntime.h" +#include "CGOpenMPRuntimeAMDGCN.h" #include "CGOpenMPRuntimeNVPTX.h" #include "CodeGenFunction.h" #include "CodeGenPGO.h" @@ -215,6 +216,11 @@ "OpenMP NVPTX is only prepared to deal with device code."); OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this)); break; + case llvm::Triple::amdgcn: + assert(getLangOpts().OpenMPIsDevice && + "OpenMP AMDGCN is only prepared to deal with device code."); + OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this)); + break; default: if (LangOpts.OpenMPSimd) OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this)); diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp @@ -0,0 +1,45 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +#define N 1000 + +int test_amdgcn_target_tid_threads() { + // CHECK-LABEL: test_amdgcn_target_tid_threads + // CHECK-LABEL: entry: + + int arr[N]; + +// CHECK: %nvptx_tid = call i32 @llvm.amdgcn.workitem.id.x() +// CHECK-DAG: %nvptx_num_threads = call i64 @__ockl_get_local_size(i32 0) +// CHECK-DAG: [[VAR1:%[0-9]+]] = trunc i64 %nvptx_num_threads to i32 +// CHECK-DAG: %thread_limit = sub nuw i32 [[VAR1]], 64 +#pragma omp target + for (int i = 0; i < N; i++) { + arr[i] = 1; + } + + return arr[0]; +} + +int test_amdgcn_target_tid_threads_simd() { + // CHECK-LABEL: test_amdgcn_target_tid_threads_simd + // CHECK-LABEL: entry: + + int arr[N]; + +// CHECK: %nvptx_num_threads = call i64 @__ockl_get_local_size(i32 0) +// CHECK-DAG: [[VAR2:%[0-9]+]] = trunc i64 %nvptx_num_threads to i32 +// CHECK-DAG: call void @__kmpc_spmd_kernel_init(i32 [[VAR2]], i16 0, i16 0) +#pragma omp target simd + for (int i = 0; i < N; i++) { + arr[i] = 1; + } + return arr[0]; +} + +#endif