Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -81,6 +81,8 @@
   OMPRTL_NVPTX__kmpc_end_reduce_nowait,
   /// Call to void __kmpc_data_sharing_init_stack();
   OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
+  /// Call to void __kmpc_data_sharing_init_stack_spmd();
+  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
   /// Call to void* __kmpc_data_sharing_push_stack(size_t size,
   /// int16_t UseSharedMemory);
   OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
@@ -1025,6 +1027,12 @@
                          /*RequiresDataSharing=*/Bld.getInt16(1)};
   CGF.EmitRuntimeCall(
       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
+
+  // For data sharing, we need to initialize the stack.
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(
+          OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
+
   CGF.EmitBranch(ExecuteBB);
 
   CGF.EmitBlock(ExecuteBB);
@@ -1107,11 +1115,6 @@
   // Wait for parallel work
   syncCTAThreads(CGF);
 
-  // For data sharing, we need to initialize the stack for workers.
-  CGF.EmitRuntimeCall(
-      createNVPTXRuntimeFunction(
-          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
-
   Address WorkFn =
       CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
   Address ExecStatus =
@@ -1417,6 +1420,14 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
+    // Build void __kmpc_data_sharing_init_stack_spmd();
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
+    RTLFn =
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
+    break;
+  }
   case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
     // Build void *__kmpc_data_sharing_push_stack(size_t size,
     // int16_t UseSharedMemory);
Index: test/OpenMP/nvptx_data_sharing.cpp
===================================================================
--- test/OpenMP/nvptx_data_sharing.cpp
+++ test/OpenMP/nvptx_data_sharing.cpp
@@ -30,7 +30,6 @@
 /// ========= In the worker function ========= ///
 // CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker()
 // CK1: call void @llvm.nvvm.barrier0()
-// CK1: call void @__kmpc_data_sharing_init_stack
 
 /// ========= In the kernel function ========= ///