Index: lib/CodeGen/CGOpenMPRuntime.h =================================================================== --- lib/CodeGen/CGOpenMPRuntime.h +++ lib/CodeGen/CGOpenMPRuntime.h @@ -912,11 +912,10 @@ /// \brief Emits call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32 /// global_tid, kmp_int32 num_teams, kmp_int32 thread_limit) to generate code /// for num_teams clause. - /// \param NumTeams An integer value of teams. - /// \param ThreadLimit An integer value of threads. - virtual void emitNumTeamsClause(CodeGenFunction &CGF, llvm::Value *NumTeams, - llvm::Value *ThreadLimit, SourceLocation Loc); - + /// \param NumTeams An integer expression of teams. + /// \param ThreadLimit An integer expression of threads. + virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, + const Expr *ThreadLimit, SourceLocation Loc); }; } // namespace CodeGen Index: lib/CodeGen/CGOpenMPRuntime.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntime.cpp +++ lib/CodeGen/CGOpenMPRuntime.cpp @@ -4832,17 +4832,29 @@ } void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF, - llvm::Value *NumTeams, - llvm::Value *ThreadLimit, + const Expr *NumTeams, + const Expr *ThreadLimit, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; auto *RTLoc = emitUpdateLocation(CGF, Loc); + llvm::Value *NumTeamsVal = + (NumTeams) + ? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(NumTeams), + CGF.CGM.Int32Ty, /* isSigned = */ true) + : CGF.Builder.getInt32(0); + + llvm::Value *ThreadLimitVal = + (ThreadLimit) + ? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(ThreadLimit), + CGF.CGM.Int32Ty, /* isSigned = */ true) + : CGF.Builder.getInt32(0); + // Build call __kmpc_push_num_teamss(&loc, global_tid, num_teams, thread_limit) - llvm::Value *PushNumTeamsArgs[] = { - RTLoc, getThreadID(CGF, Loc), NumTeams, ThreadLimit}; + llvm::Value *PushNumTeamsArgs[] = {RTLoc, getThreadID(CGF, Loc), NumTeamsVal, + ThreadLimitVal}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams), PushNumTeamsArgs); } Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -136,6 +136,41 @@ public: explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM); + + /// \brief This function ought to emit, in the general case, a call to + // the openmp runtime kmpc_push_num_teams. In NVPTX backend it is not needed + // as these numbers are obtained through the PTX grid and block configuration. + /// \param NumTeams An integer expression of teams. + /// \param ThreadLimit An integer expression of threads. + void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, + const Expr *ThreadLimit, SourceLocation Loc) override; + + /// \brief Emits inlined function for the specified OpenMP parallel + // directive but an inlined function for teams. + /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID, + /// kmp_int32 BoundID, struct context_vars*). + /// \param D OpenMP directive. + /// \param ThreadIDVar Variable for thread id in the current OpenMP region. + /// \param InnermostKind Kind of innermost directive (for simple directives it + /// is a directive itself, for combined - its innermost directive). + /// \param CodeGen Code generation sequence for the \a D directive. + llvm::Value * + emitParallelOrTeamsOutlinedFunction(const OMPExecutableDirective &D, + const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, + const RegionCodeGenTy &CodeGen) override; + + /// \brief Emits code for teams call of the \a OutlinedFn with + /// variables captured in a record which address is stored in \a + /// CapturedStruct. + /// \param OutlinedFn Outlined function to be run by team masters. Type of + /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). + /// \param CapturedVars A pointer to the record with the references to + /// variables used in \a OutlinedFn function. + /// + void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, + SourceLocation Loc, llvm::Value *OutlinedFn, + ArrayRef CapturedVars) override; }; } // CodeGen namespace. Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -14,6 +14,8 @@ #include "CGOpenMPRuntimeNVPTX.h" #include "clang/AST/DeclOpenMP.h" +#include "CodeGenFunction.h" +#include "clang/AST/StmtOpenMP.h" using namespace clang; using namespace CodeGen; @@ -350,3 +352,45 @@ // Called once per module during initialization. initializeEnvironment(); } + +void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, + const Expr *NumTeams, + const Expr *ThreadLimit, + SourceLocation Loc) {} + +llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + + llvm::Function *OutlinedFun = nullptr; + if (isa(D)) { + llvm::Value *OutlinedFunVal = + CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction( + D, ThreadIDVar, InnermostKind, CodeGen); + OutlinedFun = cast(OutlinedFunVal); + OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); + } else + llvm_unreachable("parallel directive is not yet supported for nvptx " + "backend."); + + return OutlinedFun; +} + +void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, + const OMPExecutableDirective &D, + SourceLocation Loc, + llvm::Value *OutlinedFn, + ArrayRef CapturedVars) { + if (!CGF.HaveInsertPoint()) + return; + + Address ZeroAddr = + CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), + /*Name*/ ".zero.addr"); + CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); + llvm::SmallVector OutlinedFnArgs; + OutlinedFnArgs.push_back(ZeroAddr.getPointer()); + OutlinedFnArgs.push_back(ZeroAddr.getPointer()); + OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); + CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); +} Index: lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- lib/CodeGen/CGStmtOpenMP.cpp +++ lib/CodeGen/CGStmtOpenMP.cpp @@ -3098,18 +3098,11 @@ const OMPNumTeamsClause *NT = TD.getSingleClause(); const OMPThreadLimitClause *TL = TD.getSingleClause(); if (NT || TL) { - llvm::Value *NumTeamsVal = (NT) ? CGF.Builder.CreateIntCast( - CGF.EmitScalarExpr(NT->getNumTeams()), CGF.CGM.Int32Ty, - /* isSigned = */ true) : - CGF.Builder.getInt32(0); - - llvm::Value *ThreadLimitVal = (TL) ? CGF.Builder.CreateIntCast( - CGF.EmitScalarExpr(TL->getThreadLimit()), CGF.CGM.Int32Ty, - /* isSigned = */ true) : - CGF.Builder.getInt32(0); - - CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeamsVal, - ThreadLimitVal, S.getLocStart()); + Expr *NumTeams = (NT) ? NT->getNumTeams() : nullptr; + Expr *ThreadLimit = (TL) ? TL->getThreadLimit() : nullptr; + + CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeams, ThreadLimit, + S.getLocStart()); } OMPLexicalScope Scope(CGF, S); Index: test/OpenMP/nvptx_teams_codegen.cpp =================================================================== --- /dev/null +++ test/OpenMP/nvptx_teams_codegen.cpp @@ -0,0 +1,136 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +#ifdef CK1 + +template +int tmain(T argc) { +#pragma omp target +#pragma omp teams + argc = 0; + return 0; +} + + +int main (int argc, char **argv) { +#pragma omp target +#pragma omp teams + { + argc = 0; + } + return tmain(argv); +} + +// only nvptx side: do not outline teams region and do not call fork_teams +// CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]]) +// CK1: {{.+}} = alloca i{{[0-9]+}}*, +// CK1: {{.+}} = alloca i{{[0-9]+}}*, +// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*, +// CK1: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}, +// CK1: store {{.+}} 0, {{.+}}, +// CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]], +// CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}* +// CK1-64: store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]], +// CK1-32: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], +// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]], +// CK1: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]], +// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK1: ret void +// CK1-NEXT: } + +// target region in template +// CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}***{{.+}} [[ARGC:%.+]]) +// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***, +// CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}***, +// CK1: store i{{.+}}*** [[ARGC]], i{{.+}}**** [[ARGCADDR]] +// CK1: [[ARGCADDR_REF:%.+]] = load i{{.+}}***, i{{.+}}**** [[ARGCADDR]], +// CK1: store i8*** [[ARGCADDR_REF]], i8**** [[ARGCADDR_PTR]], +// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}***, i{{.+}}**** [[ARGCADDR_PTR]], +// CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]], +// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK1: ret void +// CK1-NEXT: } + + +#endif // CK1 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 +// expected-no-diagnostics +#ifdef CK2 + +template +int tmain(T argc) { + int a = 10; + int b = 5; +#pragma omp target +#pragma omp teams num_teams(a) thread_limit(b) + { + argc = 0; + } + return 0; +} + +int main (int argc, char **argv) { + int a = 20; + int b = 5; +#pragma omp target +#pragma omp teams num_teams(a) thread_limit(b) + { + argc = 0; + } + return tmain(argv); +} + +// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]]) +// CK2: {{.}} = alloca i{{[0-9]+}}*, +// CK2: {{.}} = alloca i{{[0-9]+}}*, +// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*, +// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}, +// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}, +// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}, +// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num( +// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]], +// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]], +// CK2: store i{{[0-9]+}} [[ARGC_IN]], i{{[0-9]+}}* [[ARGCADDR]], +// CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32* +// CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32* +// CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* +// CK2-64: store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]], +// CK2-32: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], +// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]], +// CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]], +// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams( +// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK2: ret + +// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}}*{{.+}} [[A_IN:%.+]], i{{[0-9]+}}*{{.+}} [[BP:%.+]], i{{[0-9]+}}***{{.+}} [[ARGC:%.+]]) +// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}***, +// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}*, +// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}*, +// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}***, +// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num( +// CK2: store i{{[0-9]+}}* [[A_IN]], i{{[0-9]+}}** [[AADDR]], +// CK2: store i{{[0-9]+}}* [[B_IN]], i{{[0-9]+}}** [[BADDR]], +// CK2: store i{{[0-9]+}}*** [[ARGC]], i{{[0-9]+}}**** [[ARGCADDR]], +// CK2: [[A_ADDR_VAL:%.+]] = load i32*, i32** [[AADDR]] +// CK2: [[B_ADDR_VAL:%.+]] = load i32*, i32** [[BADDR]] +// CK2: [[ARGC_ADDR_VAL:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR]] +// CK2: store i{{[0-9]+}}*** [[ARGC_ADDR_VAL]], i{{[0-9]+}}**** [[ARGCADDR_PTR]], +// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR_PTR]], +// CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]], +// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams( +// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK2: ret void + +#endif // CK2 +#endif