Index: lib/CodeGen/CGOpenMPRuntime.h =================================================================== --- lib/CodeGen/CGOpenMPRuntime.h +++ lib/CodeGen/CGOpenMPRuntime.h @@ -851,10 +851,10 @@ /// \brief Emits call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32 /// global_tid, kmp_int32 num_teams, kmp_int32 thread_limit) to generate code /// for num_teams clause. - /// \param NumTeams An integer value of teams. - /// \param ThreadLimit An integer value of threads. - virtual void emitNumTeamsClause(CodeGenFunction &CGF, llvm::Value *NumTeams, - llvm::Value *ThreadLimit, SourceLocation Loc); + /// \param NumTeams An integer expression of teams. + /// \param ThreadLimit An integer expression of threads. + virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, + const Expr *ThreadLimit, SourceLocation Loc); }; Index: lib/CodeGen/CGOpenMPRuntime.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntime.cpp +++ lib/CodeGen/CGOpenMPRuntime.cpp @@ -4789,17 +4789,25 @@ } void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF, - llvm::Value *NumTeams, - llvm::Value *ThreadLimit, + const Expr *NumTeams, + const Expr *ThreadLimit, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) return; auto *RTLoc = emitUpdateLocation(CGF, Loc); + llvm::Value *NumTeamsVal = (NumTeams) ? CGF.Builder.CreateIntCast( + CGF.EmitScalarExpr(NumTeams), CGF.CGM.Int32Ty, /* isSigned = */ true) : + CGF.Builder.getInt32(0); + + llvm::Value *ThreadLimitVal = (ThreadLimit) ? CGF.Builder.CreateIntCast( + CGF.EmitScalarExpr(ThreadLimit), CGF.CGM.Int32Ty, /* isSigned = */ true) : + CGF.Builder.getInt32(0); + // Build call __kmpc_push_num_teamss(&loc, global_tid, num_teams, thread_limit) llvm::Value *PushNumTeamsArgs[] = { - RTLoc, getThreadID(CGF, Loc), NumTeams, ThreadLimit}; + RTLoc, getThreadID(CGF, Loc), NumTeamsVal, ThreadLimitVal}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams), PushNumTeamsArgs); } Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -23,6 +23,41 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime { public: explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM); + + /// \brief This function ought to emit, in the general case, a call to + // the openmp runtime kmpc_push_num_teams. In NVPTX backend it is not needed + // as these numbers are obtained through the PTX grid and block configuration. + /// \param NumTeams An integer expression of teams. + /// \param ThreadLimit An integer expression of threads. + virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, + const Expr *ThreadLimit, SourceLocation Loc); + + /// \brief Emits inlined function for the specified OpenMP parallel + // directive but an inlined function for teams. + /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID, + /// kmp_int32 BoundID, struct context_vars*). + /// \param D OpenMP directive. + /// \param ThreadIDVar Variable for thread id in the current OpenMP region. + /// \param InnermostKind Kind of innermost directive (for simple directives it + /// is a directive itself, for combined - its innermost directive). + /// \param CodeGen Code generation sequence for the \a D directive. + virtual llvm::Value *emitParallelOrTeamsOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen); + + /// \brief Emits code for teams call of the \a OutlinedFn with + /// variables captured in a record which address is stored in \a + /// CapturedStruct. + /// \param OutlinedFn Outlined function to be run by team masters. Type of + /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). + /// \param CapturedVars A pointer to the record with the references to + /// variables used in \a OutlinedFn function. + /// + virtual void emitTeamsCall(CodeGenFunction &CGF, + const OMPExecutableDirective &D, + SourceLocation Loc, llvm::Value *OutlinedFn, + ArrayRef CapturedVars); + }; } // CodeGen namespace. Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -14,9 +14,44 @@ #include "CGOpenMPRuntimeNVPTX.h" #include "clang/AST/DeclOpenMP.h" +#include "CodeGenFunction.h" +#include "clang/AST/StmtOpenMP.h" using namespace clang; using namespace CodeGen; CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) : CGOpenMPRuntime(CGM) {} + +void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, + const Expr *NumTeams, + const Expr *ThreadLimit, + SourceLocation Loc) {} + +llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + + llvm::Function *OutlinedFun = nullptr; + if (isa(D)) { + // no outlining happening for teams + } else + llvm_unreachable("parallel directive is not yet supported for nvptx " + "backend."); + + return OutlinedFun; +} + +void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, + const OMPExecutableDirective &D, + SourceLocation Loc, + llvm::Value *OutlinedFn, + ArrayRef CapturedVars) { + + // just emit the statements in the teams region inlined + auto &&CodeGen = [&D](CodeGenFunction &CGF) { + CGF.EmitStmt(cast(D.getAssociatedStmt())->getCapturedStmt()); + }; + + emitInlinedDirective(CGF, OMPD_teams, CodeGen); +} Index: lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- lib/CodeGen/CGStmtOpenMP.cpp +++ lib/CodeGen/CGStmtOpenMP.cpp @@ -2933,18 +2933,11 @@ const OMPNumTeamsClause *NT = TD.getSingleClause(); const OMPThreadLimitClause *TL = TD.getSingleClause(); if (NT || TL) { - llvm::Value *NumTeamsVal = (NT) ? CGF.Builder.CreateIntCast( - CGF.EmitScalarExpr(NT->getNumTeams()), CGF.CGM.Int32Ty, - /* isSigned = */ true) : - CGF.Builder.getInt32(0); - - llvm::Value *ThreadLimitVal = (TL) ? CGF.Builder.CreateIntCast( - CGF.EmitScalarExpr(TL->getThreadLimit()), CGF.CGM.Int32Ty, - /* isSigned = */ true) : - CGF.Builder.getInt32(0); - - CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeamsVal, - ThreadLimitVal, S.getLocStart()); + Expr *NumTeams = (NT) ? NT->getNumTeams() : nullptr; + Expr *ThreadLimit = (TL) ? TL->getThreadLimit() : nullptr; + + CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeams, ThreadLimit, + S.getLocStart()); } CGF.CGM.getOpenMPRuntime().emitTeamsCall(CGF, S, S.getLocStart(), OutlinedFn, Index: test/OpenMP/nvptx_teams_codegen.cpp =================================================================== --- /dev/null +++ test/OpenMP/nvptx_teams_codegen.cpp @@ -0,0 +1,116 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -omptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -omptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +#ifdef CK1 + +template +int tmain(T argc) { +#pragma omp target +#pragma omp teams + argc = 0; + return 0; +} + +int main (int argc, char **argv) { +#pragma omp target +#pragma omp teams + argc = 0; + return tmain(argv); +} + +// only nvptx side: do not outline teams region and do not call fork_teams +// CK1: define {{.*}}void @{{[^,]+}}(i{{.+}} [[ARGC:%.+]]) +// CK1: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}, +// CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]], +// CK1-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* +// CK1-64: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[CONV]], +// CK1-32: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR]], +// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK1: ret void +// CK1-NEXT: } + +// target region in template +// CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}***{{.+}} [[ARGC:%.+]]) +// CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}***, +// CK1: store i{{.+}}*** [[ARGC]], i{{.+}}**** [[ARGCADDR]] +// CK1: [[ARGCADDR_REF:%.+]] = load i{{.+}}***, i{{.+}}**** [[ARGCADDR]], +// CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_REF]], +// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK1: ret void +// CK1-NEXT: } + + +#endif // CK1 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -omptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -omptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 +// expected-no-diagnostics +#ifdef CK2 + +template +int tmain(T argc) { + int a = 10; + int b = 5; +#pragma omp target +#pragma omp teams num_teams(a) thread_limit(b) + { + argc = 0; + } + return 0; +} + +int main (int argc, char **argv) { + int a = 20; + int b = 5; +#pragma omp target +#pragma omp teams num_teams(a) thread_limit(b) + { + argc = 0; + } + return tmain(argv); +} + +// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]]) +// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}, +// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}, +// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}, +// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num( +// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]], +// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]], +// CK2: store i{{[0-9]+}} [[ARGC_IN]], i{{[0-9]+}}* [[ARGCADDR]], +// CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32* +// CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32* +// CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* +// CK2-64: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[CONV]], +// CK2-32: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR]], +// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams( +// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK2: ret + +// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}}*{{.+}} [[A_IN:%.+]], i{{[0-9]+}}*{{.+}} [[BP:%.+]], i{{[0-9]+}}***{{.+}} [[ARGC:%.+]]) +// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}*, +// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}*, +// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}***, +// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num( +// CK2: store i{{[0-9]+}}* [[A_IN]], i{{[0-9]+}}** [[AADDR]], +// CK2: store i{{[0-9]+}}* [[B_IN]], i{{[0-9]+}}** [[BADDR]], +// CK2: store i{{[0-9]+}}*** [[ARGC]], i{{[0-9]+}}**** [[ARGCADDR]], +// CK2: [[A_ADDR_VAL:%.+]] = load i32*, i32** [[AADDR]] +// CK2: [[B_ADDR_VAL:%.+]] = load i32*, i32** [[BADDR]] +// CK2: [[ARGC_ADDR_VAL:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR]] +// CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGC_ADDR_VAL]], +// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams( +// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK2: ret void + +#endif // CK2 +#endif