Index: lib/CodeGen/CGOpenMPRuntime.h =================================================================== --- lib/CodeGen/CGOpenMPRuntime.h +++ lib/CodeGen/CGOpenMPRuntime.h @@ -372,7 +372,7 @@ /// \param InnermostKind Kind of innermost directive (for simple directives it /// is a directive itself, for combined - its innermost directive). /// \param CodeGen Code generation sequence for the \a D directive. - virtual llvm::Value *emitParallelOutlinedFunction( + virtual llvm::Value *emitParallelOrTeamsOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen); @@ -782,6 +782,28 @@ /// was emitted in the current module and return the function that registers /// it. virtual llvm::Function *emitRegistrationFunction(); + + /// \brief Emits code for teams call of the \a OutlinedFn with + /// variables captured in a record which address is stored in \a + /// CapturedStruct. + /// \param OutlinedFn Outlined function to be run by team masters. Type of + /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). + /// \param CapturedVars A pointer to the record with the references to + /// variables used in \a OutlinedFn function. + /// + virtual void emitTeamsCall(CodeGenFunction &CGF, + const OMPExecutableDirective &D, + SourceLocation Loc, llvm::Value *OutlinedFn, + ArrayRef CapturedVars); + + /// \brief Emits call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32 + /// global_tid, kmp_int32 num_teams, kmp_int32 thread_limit) to generate code + /// for num_teams clause. + /// \param NumTeams An integer value of teams. + /// \param ThreadsLimit An integer value of threads. + virtual void emitNumTeamsClause(CodeGenFunction &CGF, llvm::Value *NumTeams, + llvm::Value *ThreadLimit, SourceLocation Loc); + }; } // namespace CodeGen Index: lib/CodeGen/CGOpenMPRuntime.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntime.cpp +++ lib/CodeGen/CGOpenMPRuntime.cpp @@ -473,6 +473,12 @@ // Call to kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid, // kmp_int32 cncl_kind); OMPRTL__kmpc_cancel, + // Call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, + // kmp_int32 num_teams, kmp_int32 thread_limit); + OMPRTL__kmpc_push_num_teams, + /// \brief Call to void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, + /// kmpc_micro microtask, ...); + OMPRTL__kmpc_fork_teams, // // Offloading related calls @@ -561,7 +567,7 @@ return CGF.Builder.CreateStructGEP(Addr, Field, Offset, Name); } -llvm::Value *CGOpenMPRuntime::emitParallelOutlinedFunction( +llvm::Value *CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { assert(ThreadIDVar->getType()->isPointerType() && @@ -1141,6 +1147,26 @@ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancel"); break; } + case OMPRTL__kmpc_push_num_teams: { + // Build void kmpc_push_num_teams (ident_t loc, kmp_int32 global_tid, + // kmp_int32 num_teams, kmp_int32 num_threads) + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, + CGM.Int32Ty}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_num_teams"); + break; + } + case OMPRTL__kmpc_fork_teams: { + // Build void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro + // microtask, ...); + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, + getKmpc_MicroPointerTy()}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ true); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_teams"); + break; + } case OMPRTL__tgt_target: { // Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t @@ -4569,3 +4595,43 @@ // compilation unit. return createOffloadingBinaryDescriptorRegistration(); } + +void CGOpenMPRuntime::emitTeamsCall(CodeGenFunction &CGF, + const OMPExecutableDirective &D, + SourceLocation Loc, + llvm::Value *OutlinedFn, + ArrayRef CapturedVars) { + if (!CGF.HaveInsertPoint()) + return; + + auto *RTLoc = emitUpdateLocation(CGF, Loc); + CodeGenFunction::RunCleanupsScope Scope(CGF); + + // Build call __kmpc_fork_teams(loc, n, microtask, var1, .., varn); + llvm::Value *Args[] = { + RTLoc, + CGF.Builder.getInt32(CapturedVars.size()), // Number of captured vars + CGF.Builder.CreateBitCast(OutlinedFn, getKmpc_MicroPointerTy())}; + llvm::SmallVector RealArgs; + RealArgs.append(std::begin(Args), std::end(Args)); + RealArgs.append(CapturedVars.begin(), CapturedVars.end()); + + auto RTLFn = createRuntimeFunction(OMPRTL__kmpc_fork_teams); + CGF.EmitRuntimeCall(RTLFn, RealArgs); +} + +void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF, + llvm::Value *NumTeams, + llvm::Value *ThreadLimit, + SourceLocation Loc) { + if (!CGF.HaveInsertPoint()) + return; + + auto *RTLoc = emitUpdateLocation(CGF, Loc); + + // Build call __kmpc_push_num_teamss(&loc, global_tid, num_teams, thread_limit) + llvm::Value *PushNumTeamsArgs[] = { + RTLoc, getThreadID(CGF, Loc), NumTeams, ThreadLimit}; + CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams), + PushNumTeamsArgs); +} Index: lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- lib/CodeGen/CGStmtOpenMP.cpp +++ lib/CodeGen/CGStmtOpenMP.cpp @@ -969,8 +969,9 @@ auto CS = cast(S.getAssociatedStmt()); llvm::SmallVector CapturedVars; CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars); - auto OutlinedFn = CGF.CGM.getOpenMPRuntime().emitParallelOutlinedFunction( - S, *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen); + auto OutlinedFn = CGF.CGM.getOpenMPRuntime(). + emitParallelOrTeamsOutlinedFunction(S, + *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen); if (const auto *NumThreadsClause = S.getSingleClause()) { CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF); auto NumThreads = CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(), @@ -2686,12 +2687,50 @@ CapturedVars); } +static void emitCommonOMPTeamsDirective(CodeGenFunction &CGF, + const OMPExecutableDirective &S, + OpenMPDirectiveKind InnermostKind, + const RegionCodeGenTy &CodeGen) { + auto CS = cast(S.getAssociatedStmt()); + llvm::SmallVector CapturedVars; + CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars); + auto OutlinedFn = CGF.CGM.getOpenMPRuntime(). + emitParallelOrTeamsOutlinedFunction(S, + *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen); + + // TODO: deal with clauses other than num_teams and thread_limit + + const OMPTeamsDirective &TD = *dyn_cast(&S); + const OMPNumTeamsClause *NT = TD.getSingleClause(); + const OMPThreadLimitClause *TL = TD.getSingleClause(); + if (NT || TL) { + llvm::Value *NumTeamsVal = (NT) ? CGF.Builder.CreateIntCast( + CGF.EmitScalarExpr(NT->getNumTeams()), CGF.CGM.Int32Ty, + /* isSigned = */ true) : + CGF.Builder.getInt32(0); + + llvm::Value *ThreadLimitVal = (TL) ? CGF.Builder.CreateIntCast( + CGF.EmitScalarExpr(TL->getThreadLimit()), CGF.CGM.Int32Ty, + /* isSigned = */ true) : + CGF.Builder.getInt32(0); + + CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeamsVal, + ThreadLimitVal, S.getLocStart()); + } + + CGF.CGM.getOpenMPRuntime().emitTeamsCall(CGF, S, S.getLocStart(), OutlinedFn, + CapturedVars); +} + void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); - const CapturedStmt &CS = *cast(S.getAssociatedStmt()); - - // FIXME: We should fork teams here instead of just emit the statement. - EmitStmt(CS.getCapturedStmt()); + // Emit parallel region as a standalone region. + auto &&CodeGen = [&S](CodeGenFunction &CGF) { + OMPPrivateScope PrivateScope(CGF); + (void)PrivateScope.Privatize(); + CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); + }; + emitCommonOMPTeamsDirective(*this, S, OMPD_teams, CodeGen); } void CodeGenFunction::EmitOMPCancellationPointDirective( Index: test/OpenMP/teams_codegen.cpp =================================================================== --- test/OpenMP/teams_codegen.cpp +++ test/OpenMP/teams_codegen.cpp @@ -208,4 +208,139 @@ } #endif // CK3 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64 +// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64 +// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-32 +// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-32 + +#ifdef CK4 + +// CK4-DAG: %ident_t = type { i32, i32, i32, i32, i8* } +// CK4-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" +// CK4-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } +// CK4-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;main;[[@LINE+14]];9;;\00" +// CK4-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;tmain;[[@LINE+7]];9;;\00" + +template +int tmain(T argc) { +#pragma omp target +#pragma omp teams + argc = 0; + return 0; +} + +int main (int argc, char **argv) { +#pragma omp target +#pragma omp teams + argc = 0; + return tmain(argv); +} + +// CK4: define {{.*}}void @{{[^,]+}}(i{{.+}} %[[ARGC:.+]]) +// CK4: [[ARGCADDR:%.+]] = alloca i{{.+}} +// CK4: store i{{.+}} %[[ARGC]], i{{.+}}* [[ARGCADDR]] +// CK4-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* +// CK4-64: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* {{.+}} to void (i32*, i32*, ...)*), i32* [[CONV]]) +// CK4-32: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* {{.+}} to void (i32*, i32*, ...)*), i32* [[ARGCADDR]]) +// CK4: ret void +// CK4-NEXT: } + +// CK4: define {{.*}}void @{{[^,]+}}(i8*** dereferenceable({{.}}) [[ARGC1:%.+]]) +// CK4: [[ARGCADDR1:%.+]] = alloca i8*** +// CK4: store i8*** [[ARGC1]], i8**** [[ARGCADDR1]] +// CK4: [[CONV1:%.+]] = load i8***, i8**** [[ARGCADDR1]] +// CK4: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i8***)* {{.+}} to void (i32*, i32*, ...)*), i8*** [[CONV1]]) + + +#endif // CK4 + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64 +// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64 +// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-32 +// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-32 + +// expected-no-diagnostics +#ifdef CK5 + +// CK5-DAG: %ident_t = type { i32, i32, i32, i32, i8* } +// CK5-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" +// CK5-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } +// CK5-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;main;[[@LINE+14]];9;;\00" +// CK5-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;tmain;[[@LINE+7]];9;;\00" + +template +int tmain(T argc) { + int a = 10; + int b = 5; +#pragma omp target +#pragma omp teams num_teams(a) thread_limit(b) + { + argc = 0; + } + return 0; +} + +int main (int argc, char **argv) { + int a = 20; + int b = 5; +#pragma omp target +#pragma omp teams num_teams(a) thread_limit(b) + { + argc = 0; + } + return tmain(argv); +} + +// CK5: define {{.*}}void @{{[^,]+}}(i{{.+}} [[AP:%.+]], i{{.+}} [[BP:%.+]], i{{.+}} [[ARGC:.+]]) +// CK5: [[AADDR:%.+]] = alloca i{{.+}} +// CK5: [[BADDR:%.+]] = alloca i{{.+}} +// CK5: [[ARGCADDR:%.+]] = alloca i{{.+}} +// CK5: [[GBL_TH_NUM:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEF_LOC_0]]) +// CK5: store i{{.+}} [[AP]], i{{.+}}* [[AADDR]] +// CK5: store i{{.+}} [[BP]], i{{.+}}* [[BADDR]] +// CK5: store i{{.+}} [[ARGC]], i{{.+}}* [[ARGCADDR]] +// CK5-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32* +// CK5-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32* +// CK5-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* +// CK5-64: [[ACONVVAL:%.+]] = load i32, i32* [[ACONV]] +// CK5-64: [[BCONVVAL:%.+]] = load i32, i32* [[BCONV]] +// CK5-32: [[ACONVVAL:%.+]] = load i32, i32* [[AADDR]] +// CK5-32: [[BCONVVAL:%.+]] = load i32, i32* [[BADDR]] +// CK5: {{.+}} = call i32 @__kmpc_push_num_teams(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TH_NUM]], i32 [[ACONVVAL]], i32 [[BCONVVAL]]) +// CK5-64: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[CONV]]) +// CK5-32: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[ARGCADDR]]) + +// CK5: define {{.*}}void @{{[^,]+}}(i{{.+}} dereferenceable({{.+}}) [[AP:%.+]], i{{.+}} dereferenceable({{.+}}) [[BP:%.+]], i{{.+}} dereferenceable({{.+}}) [[ARGC:%.+]]) +// CK5: [[AADDR:%.+]] = alloca i{{.+}} +// CK5: [[BADDR:%.+]] = alloca i{{.+}} +// CK5: [[ARGCADDR:%.+]] = alloca i{{.+}} +// CK5: [[GBL_TH_NUM:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEF_LOC_0]]) +// CK5: store i{{.+}} [[AP]], i{{.+}}* [[AADDR]] +// CK5: store i{{.+}} [[BP]], i{{.+}}* [[BADDR]] +// CK5: store i{{.+}} [[ARGC]], i{{.+}}* [[ARGCADDR]] +// CK5: [[A_ADDR_VAL:%.+]] = load i32*, i32** [[AADDR]] +// CK5: [[B_ADDR_VAL:%.+]] = load i32*, i32** [[BADDR]] +// CK5: [[ARGC_ADDR_VAL:%.+]] = load i{{.+}}, i{{.+}}* [[ARGCADDR]] +// CK5: [[A_VAL:%.+]] = load i32, i32* [[A_ADDR_VAL]] +// CK5: [[B_VAL:%.+]] = load i32, i32* [[B_ADDR_VAL]] +// CK5: {{.+}} = call i32 @__kmpc_push_num_teams(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TH_NUM]], i32 [[A_VAL]], i32 [[B_VAL]]) +// CK5: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i{{.+}})* @.omp_outlined.{{.+}} to void (i32*, i32*, ...)*), i{{.+}} [[ARGC_ADDR_VAL]]) +// CK5: ret void +// CK5-NEXT: } + +#endif // CK5 + + #endif +