diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -229,6 +229,7 @@ llvm::SmallDenseMap MappedDeclsFields; bool AllEscaped = false; bool IsForCombinedParallelRegion = false; + bool IsInSPMDKernel = false; void markAsEscaped(const ValueDecl *VD) { // Do not globalize declare target variables. @@ -242,6 +243,9 @@ // Variables captured by value must be globalized. if (auto *CSI = CGF.CapturedStmtInfo) { if (const FieldDecl *FD = CSI->lookup(cast(VD))) { + // Do not globalize captured vars in SPMD mode. + if (IsInSPMDKernel) + return; // Check if need to capture the variable that was already captured by // value in the outer region. if (!IsForCombinedParallelRegion) { @@ -351,9 +355,10 @@ public: CheckVarsEscapingDeclContext(CodeGenFunction &CGF, - ArrayRef TeamsReductions) - : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { - } + ArrayRef TeamsReductions, + bool IsInSPMDKernel = false) + : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()), + IsInSPMDKernel(IsInSPMDKernel) {} virtual ~CheckVarsEscapingDeclContext() = default; void VisitDeclStmt(const DeclStmt *S) { if (!S) @@ -1631,65 +1636,30 @@ OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { SourceLocation Loc = D.getBeginLoc(); - const RecordDecl *GlobalizedRD = nullptr; - llvm::SmallVector LastPrivatesReductions; - llvm::SmallDenseMap MappedDeclsFields; - unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size); - // Globalize team reductions variable unconditionally in all modes. - if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) - getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); - if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { - getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions); - if (!LastPrivatesReductions.empty()) { - GlobalizedRD = ::buildRecordForGlobalizedVars( - CGM.getContext(), llvm::None, LastPrivatesReductions, - MappedDeclsFields, WarpSize); - } - } else if (!LastPrivatesReductions.empty()) { - assert(!TeamAndReductions.first && - "Previous team declaration is not expected."); - TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); - std::swap(TeamAndReductions.second, LastPrivatesReductions); - } + assert(!TeamAndReductions.first && + "Previous team declaration is not expected."); + if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) + getDistributeLastprivateVars(CGM.getContext(), D, TeamAndReductions.second); + else + getTeamsReductionVars(CGM.getContext(), D, TeamAndReductions.second); + TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { SourceLocation &Loc; - const RecordDecl *GlobalizedRD; - llvm::SmallDenseMap - &MappedDeclsFields; public: - NVPTXPrePostActionTy( - SourceLocation &Loc, const RecordDecl *GlobalizedRD, - llvm::SmallDenseMap - &MappedDeclsFields) - : Loc(Loc), GlobalizedRD(GlobalizedRD), - MappedDeclsFields(MappedDeclsFields) {} + NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {} void Enter(CodeGenFunction &CGF) override { auto &Rt = static_cast(CGF.CGM.getOpenMPRuntime()); - if (GlobalizedRD) { - auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; - I->getSecond().GlobalRecord = GlobalizedRD; - I->getSecond().MappedParams = - std::make_unique(); - DeclToAddrMapTy &Data = I->getSecond().LocalVarData; - for (const auto &Pair : MappedDeclsFields) { - assert(Pair.getFirst()->isCanonicalDecl() && - "Expected canonical declaration"); - Data.insert(std::make_pair(Pair.getFirst(), - MappedVarData(Pair.getSecond(), - /*IsOnePerTeam=*/true))); - } - } Rt.emitGenericVarsProlog(CGF, Loc); } void Exit(CodeGenFunction &CGF) override { static_cast(CGF.CGM.getOpenMPRuntime()) .emitGenericVarsEpilog(CGF); } - } Action(Loc, GlobalizedRD, MappedDeclsFields); + } Action(Loc); CodeGen.setAction(Action); llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction( D, ThreadIDVar, InnermostKind, CodeGen); @@ -4310,7 +4280,8 @@ void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) { - if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) + if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic && + getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) return; assert(D && "Expected function or captured|block decl."); @@ -4327,13 +4298,16 @@ } else if (const auto *CD = dyn_cast(D)) { Body = CD->getBody(); NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; - if (NeedToDelayGlobalization && + if (NeedToDelayGlobalization && !IsInTTDRegion && getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) return; } if (!Body) return; - CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); + bool IsInSPMDKernel = NeedToDelayGlobalization && + getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD; + CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second, + IsInSPMDKernel); VarChecker.Visit(Body); const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord(IsInTTDRegion); @@ -4358,6 +4332,8 @@ const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion))); } + if (IsInSPMDKernel) + return; if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); VarChecker.Visit(Body); diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -28,16 +28,19 @@ int main(int argc, char **argv) { int b[10], c[10], d[10]; #pragma omp target teams map(tofrom:a) + { + double escaped = 0; #pragma omp distribute parallel for firstprivate(b) lastprivate(c) if(a) for (int i= 0; i < argc; ++i) - a = foo(&i) + foo(&a) + foo(&b[i]) + foo(&c[i]) + foo(&d[i]); + a = foo(&i) + foo(&a) + foo(&b[i]) + foo(&c[i]) + foo(&d[i]) + escaped; + } return 0; } // SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } // SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef // SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40 +// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 48 // SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 // CHECK-DAG: @__omp_offloading_{{.*}}_main_[[LINE:l.+]]_exec_mode = weak constant i8 0 @@ -47,9 +50,10 @@ // SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // SEQ: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 -// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 40, i16 1) +// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 48, i16 1) // CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty* -// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 +// CHECK-DAG: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 1 +// CHECK-DAG: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 // CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], // CHECK: call void @__kmpc_for_static_init_4(