Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -69,14 +69,23 @@ /// Signal termination of OMP execution for non-SPMD target entry /// function. - void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); + void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST, + WorkerFunctionState &WST); + + /// Helper to emit a runtime check whether the current thread is already + /// participating in a parallel region. + std::pair<llvm::BasicBlock *, llvm::BasicBlock *> + emitInParallelRuntimeCheck(CodeGenFunction &CGF, SourceLocation Loc, + const RegionCodeGenTy &InParallelGen, + const RegionCodeGenTy &MasterGen); /// Helper for generic variables globalization prolog. void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc, - bool WithSPMDCheck = false); + bool WithRuntimeCheck = false); /// Helper for generic variables globalization epilog. - void emitGenericVarsEpilog(CodeGenFunction &CGF, bool WithSPMDCheck = false); + void emitGenericVarsEpilog(CodeGenFunction &CGF, SourceLocation Loc, + bool WithRuntimeCheck = false); /// Helper for SPMD mode target directive's entry function. 
void emitSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, @@ -385,7 +394,6 @@ llvm::SmallVector<llvm::Value *, 4> EscapedVariableLengthDeclsAddrs; const RecordDecl *GlobalRecord = nullptr; llvm::Value *GlobalRecordAddr = nullptr; - llvm::Value *IsInSPMDModeFlag = nullptr; std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams; }; /// Maps the function to the list of the globalized variables with their Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -1088,7 +1088,7 @@ } void Exit(CodeGenFunction &CGF) override { static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) - .emitNonSPMDEntryFooter(CGF, EST); + .emitNonSPMDEntryFooter(CGF, EST, WST); } } Action(EST, WST); CodeGen.setAction(Action); @@ -1147,12 +1147,13 @@ } void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { + EntryFunctionState &EST, + WorkerFunctionState &WST) { IsInTargetMasterThreadRegion = false; if (!CGF.HaveInsertPoint()) return; - emitGenericVarsEpilog(CGF); + emitGenericVarsEpilog(CGF, WST.Loc); if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); @@ -1876,7 +1877,7 @@ } void Exit(CodeGenFunction &CGF) override { static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) - .emitGenericVarsEpilog(CGF); + .emitGenericVarsEpilog(CGF, Loc); } } Action(Loc, GlobalizedRD, MappedDeclsFields); CodeGen.setAction(Action); @@ -1890,15 +1891,58 @@ return OutlinedFun; } +std::pair<llvm::BasicBlock *, llvm::BasicBlock *> +CGOpenMPRuntimeNVPTX::emitInParallelRuntimeCheck( + CodeGenFunction &CGF, SourceLocation Loc, + const RegionCodeGenTy &InParallelGen, const RegionCodeGenTy &MasterGen) { + // Check for SPMD mode and then parallelism: + // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) { + // Already in parallel. + // } else { + // Code for master thread. + // (Worker threads have forked off to the generated worker function.) 
+ // } + CGBuilderTy &Bld = CGF.Builder; + llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); + llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck"); + llvm::BasicBlock *InParallelBB = CGF.createBasicBlock(".in-parallel"); + llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); + llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode))); + Bld.CreateCondBr(IsSPMD, InParallelBB, ParallelCheckBB); + // There is no need to emit line number for unconditional branch. + (void)ApplyDebugLocation::CreateEmpty(CGF); + CGF.EmitBlock(ParallelCheckBB); + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); + llvm::Value *ThreadID = getThreadID(CGF, Loc); + llvm::Value *PL = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level), + {RTLoc, ThreadID}); + llvm::Value *Res = Bld.CreateIsNotNull(PL); + Bld.CreateCondBr(Res, InParallelBB, MasterBB); + CGF.EmitBlock(InParallelBB); + InParallelGen(CGF); + CGF.EmitBranch(ExitBB); + // There is no need to emit line number for unconditional branch. + (void)ApplyDebugLocation::CreateEmpty(CGF); + CGF.EmitBlock(MasterBB); + MasterGen(CGF); + CGF.EmitBranch(ExitBB); + // There is no need to emit line number for unconditional branch. + (void)ApplyDebugLocation::CreateEmpty(CGF); + // Emit the continuation block for code after the if. + CGF.EmitBlock(ExitBB, /*IsFinished=*/true); + + return {InParallelBB, MasterBB}; +} + void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc, - bool WithSPMDCheck) { + bool WithRuntimeCheck) { if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic && getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD) return; - CGBuilderTy &Bld = CGF.Builder; - const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I == FunctionGlobalizedDecls.end()) return; @@ -1909,62 +1953,50 @@ // handle the specifics of the allocation of the memory. 
// Use actual memory size of the record including the padding // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(RecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(RecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - - llvm::Value *GlobalRecCastAddr; - if (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode))); - Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(SPMDBB); - Address RecPtr = CGF.CreateMemTemp(RecTy, "_local_stack"); - CGF.EmitBranch(ExitBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); + llvm::Value *GlobalRecValue, *GlobalRecCastAddr; + auto &&DataSharingGen = [this, &RecTy, &GlobalRecValue, &GlobalRecCastAddr]( + CodeGenFunction &CGF, PrePostActionTy &) { + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(RecTy).getQuantity(); + unsigned GlobalRecordSize = + CGM.getContext().getTypeSizeInChars(RecTy).getQuantity(); + GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); + // TODO: allow the usage of shared memory to be controlled by // the user, for now, default to global. 
llvm::Value *GlobalRecordSizeArg[] = { llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = + GlobalRecValue = CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_data_sharing_push_stack), GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + GlobalRecCastAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo()); - CGF.EmitBlock(ExitBB); - auto *Phi = Bld.CreatePHI(GlobalRecCastAddr->getType(), + }; + if (WithRuntimeCheck || + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + Address RecPtr = Address::invalid(); + auto &&LocalGen = [&RecPtr, &RecTy](CodeGenFunction &CGF, + PrePostActionTy &) { + RecPtr = CGF.CreateMemTemp(RecTy, "_local_stack"); + }; + + std::pair<llvm::BasicBlock *, llvm::BasicBlock *> Blocks = + emitInParallelRuntimeCheck(CGF, Loc, LocalGen, DataSharingGen); + + auto *Phi = + CGF.Builder.CreatePHI(GlobalRecCastAddr->getType(), /*NumReservedValues=*/2, "_select_stack"); - Phi->addIncoming(RecPtr.getPointer(), SPMDBB); - Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB); + Phi->addIncoming(RecPtr.getPointer(), Blocks.first); + Phi->addIncoming(GlobalRecCastAddr, Blocks.second); + GlobalRecCastAddr = Phi; I->getSecond().GlobalRecordAddr = Phi; - I->getSecond().IsInSPMDModeFlag = IsSPMD; } else { - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. 
- llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), - CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo()); + RegionCodeGenTy DataSharingRCG(DataSharingGen); + DataSharingRCG(CGF); I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; } LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy); @@ -2024,7 +2056,8 @@ } void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, - bool WithSPMDCheck) { + SourceLocation Loc, + bool WithRuntimeCheck) { if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic && getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD) return; @@ -2040,25 +2073,22 @@ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), Addr); } - if (I->getSecond().GlobalRecordAddr) { - if (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. 
- (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), - CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); - CGF.EmitBlock(ExitBB); - } else { + if (llvm::Value *GlobalRecordAddr = I->getSecond().GlobalRecordAddr) { + auto &&DataSharingGen = [this, &GlobalRecordAddr](CodeGenFunction &CGF, + PrePostActionTy &) { CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), - I->getSecond().GlobalRecordAddr); + CGF.EmitCastToVoidPtr(GlobalRecordAddr)); + }; + if (WithRuntimeCheck || + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + auto &&LocalGen = [](CodeGenFunction &CGF, PrePostActionTy &) { + // Nothing to do. + }; + emitInParallelRuntimeCheck(CGF, Loc, LocalGen, DataSharingGen); + } else { + RegionCodeGenTy DataSharingRCG(DataSharingGen); + DataSharingRCG(CGF); } } } @@ -2209,48 +2239,12 @@ auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen]( CodeGenFunction &CGF, PrePostActionTy &Action) { - if (IsInParallelRegion) { + if (IsInParallelRegion) SeqGen(CGF, Action); - } else if (IsInTargetMasterThreadRegion) { + else if (IsInTargetMasterThreadRegion) L0ParallelGen(CGF, Action); - } else { - // Check for master and then parallelism: - // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) { - // Serialized execution. - // } else { - // Worker call. 
- // } - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential"); - llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck"); - llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); - llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode))); - Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(ParallelCheckBB); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *PL = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level), - {RTLoc, ThreadID}); - llvm::Value *Res = Bld.CreateIsNotNull(PL); - Bld.CreateCondBr(Res, SeqBB, MasterBB); - CGF.EmitBlock(SeqBB); - SeqGen(CGF, Action); - CGF.EmitBranch(ExitBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(MasterBB); - L0ParallelGen(CGF, Action); - CGF.EmitBranch(ExitBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - // Emit the continuation block for code after the if. 
- CGF.EmitBlock(ExitBB, /*IsFinished=*/true); - } + else + emitInParallelRuntimeCheck(CGF, Loc, SeqGen, L0ParallelGen); }; if (IfCond) { @@ -4018,16 +4012,18 @@ Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid()))); } if (!NeedToDelayGlobalization) { - emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true); + emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithRuntimeCheck=*/true); struct GlobalizationScope final : EHScopeStack::Cleanup { - GlobalizationScope() = default; + SourceLocation Loc; + GlobalizationScope(SourceLocation Loc) : Loc(Loc) {} void Emit(CodeGenFunction &CGF, Flags flags) override { static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) - .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true); + .emitGenericVarsEpilog(CGF, Loc, /*WithRuntimeCheck=*/true); } }; - CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup); + CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup, + D->getBeginLoc()); } } Index: test/OpenMP/declare_target_codegen_globalization.cpp =================================================================== --- test/OpenMP/declare_target_codegen_globalization.cpp +++ test/OpenMP/declare_target_codegen_globalization.cpp @@ -35,10 +35,14 @@ // CHECK-NOT: @__kmpc_data_sharing_push_stack // CHECK: define {{.*}}[[BAR]]() +// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* // CHECK: [[STACK:%.+]] = alloca [[GLOBAL_ST:%.+]], // CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() // CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 // CHECK: br i1 [[IS_SPMD]], label +// CHECK: [[RES:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @{{.+}}, i32 [[GTID]]) +// CHECK: icmp ne i16 [[RES]], 0 +// CHECK: br i1 // CHECK: br label // CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0) // CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST]]* @@ -46,7 +50,13 @@ // CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ [[STACK]], {{.+}} ], [ [[GLOBALS]], {{.+}} ] // CHECK: [[A_ADDR:%.+]] = 
getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: call {{.*}}[[FOO]](i32* dereferenceable{{.*}} [[A_ADDR]]) +// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() +// CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 // CHECK: br i1 [[IS_SPMD]], label +// CHECK: [[RES:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @{{.+}}, i32 [[GTID]]) +// CHECK: icmp ne i16 [[RES]], 0 +// CHECK: br i1 +// CHECK: br label // CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8* // CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]]) // CHECK: br label Index: test/OpenMP/nvptx_target_codegen.cpp =================================================================== --- test/OpenMP/nvptx_target_codegen.cpp +++ test/OpenMP/nvptx_target_codegen.cpp @@ -554,13 +554,18 @@ // CHECK: ret void // CHECK: define i32 [[BAZ]](i32 [[F:%.*]], double* dereferenceable{{.*}}) + // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* // CHECK: [[STACK:%.+]] = alloca [[GLOBAL_ST:%.+]], // CHECK: [[ZERO_ADDR:%.+]] = alloca i32, - // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* // CHECK: store i32 0, i32* [[ZERO_ADDR]] // CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() // CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 // CHECK: br i1 [[IS_SPMD]], label + + // CHECK: [[RES:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @{{.+}}, i32 [[GTID]]) + // CHECK: icmp ne i16 [[RES]], 0 + // CHECK: br i1 + // CHECK: br label // CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0) // CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to [[GLOBAL_ST]]* @@ -595,7 +600,14 @@ // CHECK: [[RES:%.+]] = load i32, i32* [[F_PTR]], // CHECK: store i32 [[RES]], i32* [[RET:%.+]], - // CHECK: br i1 [[IS_SPMD]], label + // CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() + // CHECK: icmp ne i8 [[RES]], 0 + // CHECK: br i1 + + // CHECK: [[RES:%.+]] = call 
i16 @__kmpc_parallel_level(%struct.ident_t* @{{.+}}, i32 [[GTID]]) + // CHECK: icmp ne i16 [[RES]], 0 + // CHECK: br i1 + // CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8* // CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]]) // CHECK: br label