Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -81,6 +81,8 @@
   OMPRTL_NVPTX__kmpc_end_reduce_nowait,
   /// Call to void __kmpc_data_sharing_init_stack();
   OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
+  /// Call to void __kmpc_data_sharing_init_stack_spmd();
+  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
   /// Call to void* __kmpc_data_sharing_push_stack(size_t size,
   /// int16_t UseSharedMemory);
   OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
@@ -1025,6 +1027,12 @@
                          /*RequiresDataSharing=*/Bld.getInt16(1)};
   CGF.EmitRuntimeCall(
       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
+
+  // For data sharing, we need to initialize the stack.
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(
+          OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
+
   CGF.EmitBranch(ExecuteBB);
 
   CGF.EmitBlock(ExecuteBB);
@@ -1107,11 +1115,6 @@
   // Wait for parallel work
   syncCTAThreads(CGF);
 
-  // For data sharing, we need to initialize the stack for workers.
-  CGF.EmitRuntimeCall(
-      createNVPTXRuntimeFunction(
-          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
-
   Address WorkFn =
       CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
   Address ExecStatus =
@@ -1417,6 +1420,13 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
+    /// Build void __kmpc_data_sharing_init_stack_spmd();
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
+    break;
+  }
   case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
     // Build void *__kmpc_data_sharing_push_stack(size_t size,
     // int16_t UseSharedMemory);
Index: test/OpenMP/nvptx_data_sharing.cpp
===================================================================
--- test/OpenMP/nvptx_data_sharing.cpp
+++ test/OpenMP/nvptx_data_sharing.cpp
@@ -30,7 +30,6 @@
 /// ========= In the worker function ========= ///
 // CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker()
 // CK1: call void @llvm.nvvm.barrier0()
-// CK1: call void @__kmpc_data_sharing_init_stack
 
 /// ========= In the kernel function ========= ///
 
Index: test/OpenMP/nvptx_data_sharing_spmd.cpp
===================================================================
--- /dev/null
+++ test/OpenMP/nvptx_data_sharing_spmd.cpp
@@ -0,0 +1,24 @@
+// Test device global memory data sharing initialization codegen for spmd.
+///==========================================================================///
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void test_ds(){
+  #pragma omp target teams distribute parallel for
+  for (int i = 0; i < 100; i++ ) {
+    int a = 100;
+  }
+}
+
+// CK1: {{.*}}define weak void @__omp_offloading{{.*}}test_ds{{.*}}()
+// CK1: call void @__kmpc_spmd_kernel_init
+// CK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd
+
+#endif
+