diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -618,9 +618,13 @@
     CaptureRegions.push_back(OMPD_teams);
     break;
   case OMPD_target:
+    CaptureRegions.push_back(OMPD_task);
+    CaptureRegions.push_back(OMPD_target);
+    break;
   case OMPD_target_simd:
     CaptureRegions.push_back(OMPD_task);
     CaptureRegions.push_back(OMPD_target);
+    CaptureRegions.push_back(OMPD_simd);
     break;
   case OMPD_teams_distribute_parallel_for:
   case OMPD_teams_distribute_parallel_for_simd:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -967,6 +967,18 @@
       const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
       OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen);
 
+  /// Emits the outlined function for the 'simd' captured region of the OpenMP
+  /// directive \a D. The base implementation outlines it through the same
+  /// helper that is used for the 'parallel' and 'teams' regions.
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  virtual llvm::Function *emitSIMDOutlinedFunction(
+      const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+      OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen);
+
   /// Emits outlined function for the specified OpenMP teams directive
   /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
   /// kmp_int32 BoundID, struct context_vars*).
@@ -1019,6 +1031,19 @@
                                 ArrayRef<llvm::Value *> CapturedVars,
                                 const Expr *IfCond);
 
+  /// Emits the call that runs the outlined 'simd' region \p OutlinedFn with
+  /// the captured values \p CapturedVars. The base implementation in
+  /// CGOpenMPRuntime.cpp emits nothing; CGOpenMPRuntimeGPU overrides it.
+  /// \param OutlinedFn Outlined function for the 'simd' region.
+  /// \param CapturedVars Values captured for the region that are handed to
+  /// \a OutlinedFn.
+  /// \param IfCond Condition controlling SIMD execution (the current caller
+  /// passes the loop precondition), nullptr if there is none.
+  virtual void emitSIMDCall(CodeGenFunction &CGF, SourceLocation Loc,
+                            llvm::Function *OutlinedFn,
+                            ArrayRef<llvm::Value *> CapturedVars,
+                            const Expr *IfCond);
+
   /// Emits a critical region.
   /// \param CriticalName Name of the critical region.
   /// \param CriticalOpGen Generator for the statement associated with the given
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1295,6 +1295,14 @@
       CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(), CodeGen);
 }
 
+llvm::Function *CGOpenMPRuntime::emitSIMDOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+  return emitParallelOrTeamsOutlinedFunction(
+      CGM, D, /*CS=*/D.getCapturedStmt(OMPD_simd), ThreadIDVar,
+      InnermostKind, getOutlinedHelperName(), CodeGen);
+}
+
 llvm::Function *CGOpenMPRuntime::emitTeamsOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
@@ -2143,6 +2151,11 @@
   }
 }
 
+void CGOpenMPRuntime::emitSIMDCall(CodeGenFunction &CGF, SourceLocation Loc,
+                                   llvm::Function *OutlinedFn,
+                                   ArrayRef<llvm::Value *> CapturedVars,
+                                   const Expr *IfCond) {} // Emits nothing here.
+
 // If we're inside an (outlined) parallel region, use the region info's
 // thread-ID variable (it is passed in a first argument of the outlined function
 // as "kmp_int32 *gtid"). Otherwise, if we're not inside parallel region, but in
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -262,6 +262,19 @@
                         ArrayRef<llvm::Value *> CapturedVars,
                         const Expr *IfCond) override;
 
+  /// Emits a call to the __kmpc_simd_51 runtime entry for the outlined 'simd'
+  /// region \p OutlinedFn, passing \p CapturedVars as an array of pointers.
+  /// \param OutlinedFn Outlined function for the 'simd' region; it is passed
+  /// to the runtime as an opaque pointer.
+  /// \param CapturedVars Values captured for the region; each one is stored
+  /// into an array of void pointers that is handed to the runtime.
+  /// \param IfCond Condition evaluated and forwarded as the runtime's 'if'
+  /// argument; when nullptr the constant 1 is passed instead.
+  void emitSIMDCall(CodeGenFunction &CGF, SourceLocation Loc,
+                    llvm::Function *OutlinedFn,
+                    ArrayRef<llvm::Value *> CapturedVars,
+                    const Expr *IfCond) override;
+
   /// Emit an implicit/explicit barrier for OpenMP threads.
   /// \param Kind Directive for which this implicit barrier call must be
   /// generated. Must be OMPD_barrier for explicit barrier generation.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1587,6 +1587,64 @@
   RCG(CGF);
 }
 
+/// Emit the call to the device runtime entry __kmpc_simd_51 for the outlined
+/// simd region \p OutlinedFn, forwarding \p CapturedVars and \p IfCond.
+void CGOpenMPRuntimeGPU::emitSIMDCall(CodeGenFunction &CGF, SourceLocation Loc,
+                                      llvm::Function *OutlinedFn,
+                                      ArrayRef<llvm::Value *> CapturedVars,
+                                      const Expr *IfCond) {
+  auto &&SIMDGen = [this, Loc, OutlinedFn, CapturedVars,
+                    IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    CGBuilderTy &Bld = CGF.Builder;
+    // Temporary array with one void* slot per captured variable.
+    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
+        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
+        "captured_vars_addrs");
+    // There's something to share.
+    if (!CapturedVars.empty()) {
+      // Store each captured value as a void* in its slot of the array.
+      ASTContext &Ctx = CGF.getContext();
+      unsigned Idx = 0;
+      for (llvm::Value *V : CapturedVars) {
+        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
+        llvm::Value *PtrV;
+        if (V->getType()->isIntegerTy())
+          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
+        else
+          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
+        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
+                              Ctx.getPointerType(Ctx.VoidPtrTy));
+        ++Idx;
+      }
+    }
+    // Runtime arguments: source location and the 'if' value (1 if no IfCond).
+    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+    llvm::Value *IfCondVal = nullptr;
+    if (IfCond)
+      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
+                                    /* isSigned */ false);
+    else
+      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
+    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
+    llvm::Value *Args[] = {
+        RTLoc,
+        getThreadID(CGF, Loc),
+        IfCondVal,
+        llvm::ConstantInt::get(CGF.Int32Ty, -1), // safelen slot
+        llvm::ConstantInt::get(CGF.Int32Ty, -1), // simdlen slot
+        FnPtr,
+        llvm::ConstantPointerNull::get(CGM.Int8PtrTy),
+        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
+                                   CGF.VoidPtrPtrTy),
+        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
+    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                            CGM.getModule(), OMPRTL___kmpc_simd_51),
+                        Args);
+  };
+  RegionCodeGenTy RCG(SIMDGen);
+  RCG(CGF);
+}
+
 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
   // Always emit simple barriers!
   if (!CGF.HaveInsertPoint())
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -2487,6 +2487,48 @@
   }
 }
 
+/// Device lowering of the simd directive \p S: outline its 'simd' captured
+/// statement through the OpenMP runtime and emit the runtime SIMD call.
+static void emitOMPTargetSimdRegion(CodeGenFunction &CGF,
+                                    const OMPLoopDirective &S,
+                                    PrePostActionTy &Action) {
+  assert(isOpenMPSimdDirective(S.getDirectiveKind()) &&
+         "Expected simd directive");
+
+  Action.Enter(CGF);
+  OMPLoopScope PreInitScope(CGF, S);
+  if (isOpenMPDistributeDirective(S.getDirectiveKind()) ||
+      isOpenMPWorksharingDirective(S.getDirectiveKind()) ||
+      isOpenMPTaskLoopDirective(S.getDirectiveKind())) {
+    (void)EmitOMPHelperVar(CGF, cast<DeclRefExpr>(S.getLowerBoundVariable()));
+    (void)EmitOMPHelperVar(CGF, cast<DeclRefExpr>(S.getUpperBoundVariable()));
+  }
+
+  bool CondConstant;
+  if (CGF.ConstantFoldsToSimpleInteger(S.getPreCond(), CondConstant)) {
+    // If the condition is evaluated to false, we don't have to move on.
+    if (!CondConstant)
+      return;
+  }
+  // The outlined function is built from the innermost 'simd' capture.
+  const CapturedStmt *CS = S.getCapturedStmt(OMPD_simd);
+  // NOTE(review): the body below only enters Action; the loop is not emitted.
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    // CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds,
+    //                            emitDispatchForLoopBounds);
+  };
+
+  llvm::Function *OutlinedFn =
+      CGF.CGM.getOpenMPRuntime().emitSIMDOutlinedFunction(
+          S, *CS->getCapturedDecl()->param_begin(), OMPD_simd, CodeGen);
+  llvm::SmallVector<llvm::Value *, 16> CapturedVars;
+  CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
+  // NOTE(review): the loop precondition is passed as the 'if' condition.
+  CGF.CGM.getOpenMPRuntime().emitSIMDCall(CGF, S.getBeginLoc(), OutlinedFn,
+                                          CapturedVars, S.getPreCond());
+}
+
 static void emitOMPSimdRegion(CodeGenFunction &CGF, const OMPLoopDirective &S,
                               PrePostActionTy &Action) {
   Action.Enter(CGF);
@@ -2585,17 +2627,24 @@
 void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
   ParentLoopDirectiveForScanRegion ScanRegion(*this, S);
   OMPFirstScanLoop = true;
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
-    emitOMPSimdRegion(CGF, S, Action);
+    // Device compilations lower the region through emitOMPTargetSimdRegion,
+    // which outlines it and calls the runtime's emitSIMDCall; everything else
+    // keeps using emitOMPSimdRegion.
+    // FIXME: For device code gen, we have to outline the function.
+    if (CGF.CGM.getLangOpts().OpenMPIsDevice)
+      emitOMPTargetSimdRegion(CGF, S, Action);
+    else
+      emitOMPSimdRegion(CGF, S, Action);
   };
   {
     auto LPCRegion =
         CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
     OMPLexicalScope Scope(*this, S, OMPD_unknown);
     CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen);
   }
   // Check for outer lastprivate conditional update.
   checkForLastprivateConditionalUpdate(*this, S);
 }
 
 void CodeGenFunction::EmitOMPTileDirective(const OMPTileDirective &S) {
@@ -3094,7 +3151,7 @@
     CodeGenModule &CGM, StringRef ParentName, const OMPTargetSimdDirective &S) {
   // Emit SPMD target parallel for region as a standalone region.
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
-    emitOMPSimdRegion(CGF, S, Action);
+    emitOMPTargetSimdRegion(CGF, S, Action);
   };
   llvm::Function *Fn;
   llvm::Constant *Addr;
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -3977,7 +3977,38 @@
                              ParamsTeamsOrParallel, /*OpenMPCaptureLevel=*/2);
     break;
   }
-  case OMPD_target:
+  case OMPD_target: {
+    QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
+    QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
+    QualType KmpInt32PtrTy =
+        Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
+    QualType Args[] = {VoidPtrTy};
+    FunctionProtoType::ExtProtoInfo EPI;
+    EPI.Variadic = true;
+    QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
+    Sema::CapturedParamNameType Params[] = {
+        std::make_pair(".global_tid.", KmpInt32Ty),
+        std::make_pair(".part_id.", KmpInt32PtrTy),
+        std::make_pair(".privates.", VoidPtrTy),
+        std::make_pair(
+            ".copy_fn.",
+            Context.getPointerType(CopyFnType).withConst().withRestrict()),
+        std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
+        std::make_pair(StringRef(), QualType()) // __context with shared vars
+    };
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             Params, /*OpenMPCaptureLevel=*/0);
+    // Mark this captured region as inlined, because we don't use outlined
+    // function directly.
+    getCurCapturedRegion()->TheCapturedDecl->addAttr(
+        AlwaysInlineAttr::CreateImplicit(
+            Context, {}, AttributeCommonInfo::AS_Keyword,
+            AlwaysInlineAttr::Keyword_forceinline));
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             std::make_pair(StringRef(), QualType()),
+                             /*OpenMPCaptureLevel=*/1);
+    break;
+  }
   case OMPD_target_simd: {
     QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
     QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
@@ -4008,6 +4039,15 @@
     ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
                              std::make_pair(StringRef(), QualType()),
                              /*OpenMPCaptureLevel=*/1);
+    Sema::CapturedParamNameType ParamsSIMD[] = {
+        std::make_pair(".global_tid.", KmpInt32PtrTy),
+        std::make_pair(".bound_tid.", KmpInt32PtrTy),
+        std::make_pair(StringRef(), QualType()) // __context with shared vars
+    };
+    // Start the innermost captured region for 'simd'. Its implicit parameters
+    // are the global thread id, the bound thread id and the context.
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             ParamsSIMD, /*OpenMPCaptureLevel=*/2);
     break;
   }
   case OMPD_atomic:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -423,6 +423,11 @@
 __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32)
+/// OpenMP target SIMD functions
+__OMP_RTL(__kmpc_simd_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
+          VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
+__OMP_RTL(__kmpc_kernel_simd, false, Int1, VoidPtrPtr)
+__OMP_RTL(__kmpc_kernel_end_simd, false, Void, )
 __OMP_RTL(__kmpc_shuffle_int32, false, Int32, Int32, Int16, Int16)
 __OMP_RTL(__kmpc_nvptx_parallel_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
           Int32, SizeTy, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -281,6 +281,12 @@
 /// TODO
 void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t);
 
+/// Fetch the SIMD work function into \p WorkFn; true if this lane is active.
+bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn);
+
+/// Called by a SIMD lane after running its work function; currently empty.
+void __kmpc_kernel_end_simd();
+
 /// TODO
 void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
 
diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h
--- a/openmp/libomptarget/DeviceRTL/include/Mapping.h
+++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h
@@ -25,7 +25,7 @@
 #pragma omp end declare target
 
 /// Initialize the mapping machinery.
-void init(bool IsSPMD);
+void init(int8_t Mode);
 
 /// Return true if the kernel is executed in SPMD mode.
 bool isSPMDMode();
@@ -33,6 +33,9 @@
 /// Return true if the kernel is executed in generic mode.
 bool isGenericMode();
 
+/// Return true if the kernel is executed in SIMD mode.
+bool isSIMDMode();
+
 /// Return true if the executing thread is the main thread in generic mode.
 bool isMainThreadInGenericMode();
 
@@ -55,6 +58,12 @@
 /// Return the thread Id in the block, in [0, getBlockSize()).
 uint32_t getThreadIdInBlock();
 
+/// Return the logical thread Id of the caller. In non-SIMD mode an OpenMP
+/// thread is mapped to a device thread and this is the thread Id in the block.
+/// In SIMD mode an OpenMP thread is mapped to a warp, each thread in the warp
+/// is a SIMD lane, and this is the warp Id.
+uint32_t getLogicThreadId();
+
 /// Return the warp id in the block.
 uint32_t getWarpId();
 
@@ -79,6 +88,19 @@
 /// Return the number of processing elements on the device.
 uint32_t getNumberOfProcessorElements();
 
+// Helpers that test an explicit mode value. Inside `mapping` this namespace
+// hides ::_OMP::utils, which then has to be spelled fully qualified.
+namespace utils {
+/// Return true if the SPMD bit is set in \p Mode.
+inline bool isSPMDMode(int8_t Mode) { return Mode & OMP_TGT_EXEC_MODE_SPMD; }
+/// Return true if the generic bit is set in \p Mode.
+inline bool isGenericMode(int8_t Mode) {
+  return Mode & OMP_TGT_EXEC_MODE_GENERIC;
+}
+/// Return true if the SIMD bit is set in \p Mode.
+inline bool isSIMDMode(int8_t Mode) { return Mode & OMP_TGT_EXEC_MODE_SIMD; }
+} // namespace utils
+
 } // namespace mapping
 
 } // namespace _OMP
diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
--- a/openmp/libomptarget/DeviceRTL/include/State.h
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -24,7 +24,7 @@
 inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
 
 /// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(int8_t Mode);
 
 /// TODO
 enum ValueKind {
@@ -37,6 +37,10 @@
   VK_RunSchedChunk,
   VK_ParallelRegionFn,
   VK_ParallelTeamSize,
+  // SIMD
+  VK_SIMDLevel,
+  VK_SIMDRegionFn,
+  VK_SIMDLaneWidth,
 };
 
 /// TODO
@@ -145,10 +149,20 @@
 inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
     ParallelRegionFn;
 
+/// Number of lanes of a warp that take part in the current SIMD region.
+inline state::Value<uint32_t, state::VK_SIMDLaneWidth> SIMDLaneWidth;
+
+/// Work function handed to SIMD workers by __kmpc_kernel_simd; null ends them.
+inline state::PtrValue<SIMDRegionFnTy, state::VK_SIMDRegionFn> SIMDRegionFn;
+
 void runAndCheckState(void(Func(void)));
 
 void assumeInitialState(bool IsSPMD);
 
+/// Propagate the thread state from the leader in the warp to the rest of SIMD
+/// workers. This function should only be called in SIMD mode.
+void propagateThreadState(unsigned SIMDLen);
+
 } // namespace state
 
 namespace icv {
@@ -171,6 +185,9 @@
 /// TODO
 inline state::Value<uint32_t, state::VK_RunSched> RunSched;
 
+/// When non-zero, __kmpc_simd_51 runs the region serially (nested simd).
+inline state::Value<uint32_t, state::VK_SIMDLevel> SIMDLevel;
+
 } // namespace icv
 
 namespace memory {
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -19,7 +19,7 @@
 namespace synchronize {
 
 /// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(int8_t Mode);
 
 /// Synchronize all threads in a warp identified by \p Mask.
 void warp(LaneMaskTy Mask);
diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h
--- a/openmp/libomptarget/DeviceRTL/include/Types.h
+++ b/openmp/libomptarget/DeviceRTL/include/Types.h
@@ -150,6 +150,8 @@
 
 using ParallelRegionFnTy = void *;
 
+using SIMDRegionFnTy = void *;
+
 using CriticalNameTy = int32_t[8];
 
 struct omp_lock_t {
@@ -181,6 +183,7 @@
 enum OMPTgtExecModeFlags : int8_t {
   OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
   OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
+  OMP_TGT_EXEC_MODE_SIMD = 1 << 2,
 };
 
 #define __PRAGMA(STR) _Pragma(#STR)
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -21,17 +21,17 @@
 
 #pragma omp declare target
 
-static void inititializeRuntime(bool IsSPMD) {
+static void inititializeRuntime(int8_t Mode) {
   // Order is important here.
-  synchronize::init(IsSPMD);
-  mapping::init(IsSPMD);
-  state::init(IsSPMD);
+  synchronize::init(Mode);
+  mapping::init(Mode);
+  state::init(Mode);
 }
 
 /// Simple generic state machine for worker threads.
 static void genericStateMachine(IdentTy *Ident) {
 
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
 
   do {
     ParallelRegionFnTy WorkFn = 0;
@@ -58,6 +58,31 @@
   } while (true);
 }
 
+namespace {
+/// State machine for SIMD worker lanes: wait on a warp barrier, fetch the work
+/// function, run it if this lane is active; a null work function ends it.
+void runSIMDStateMachine(IdentTy *Ident) {
+  uint32_t LaneId = mapping::getThreadIdInWarp();
+  do {
+    SIMDRegionFnTy WorkFn = nullptr;
+    // Wait for the signal that we have a new work function.
+    synchronize::warp(mapping::activemask());
+    // Retrieve the work function from the runtime.
+    bool IsActive = __kmpc_kernel_simd(&WorkFn);
+    // A null work function terminates the state machine.
+    if (!WorkFn)
+      return;
+    // The work function is invoked with 0 and the lane id as its arguments.
+    if (IsActive) {
+      ((void (*)(uint32_t, uint32_t))WorkFn)(0, LaneId);
+      __kmpc_kernel_end_simd();
+    }
+    // Warp barrier that follows the execution of the SIMD region.
+    synchronize::warp(mapping::activemask());
+  } while (true);
+}
+} // namespace
+
 extern "C" {
 
 /// Initialization
@@ -66,17 +91,20 @@
 ///
 int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
                            bool UseGenericStateMachine, bool) {
-  const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
-  if (IsSPMD) {
-    inititializeRuntime(/* IsSPMD */ true);
-    synchronize::threads();
-  } else {
-    inititializeRuntime(/* IsSPMD */ false);
-    // No need to wait since only the main threads will execute user
-    // code and workers will run into a barrier right away.
+  inititializeRuntime(Mode);
+
+  // For all SIMD workers, start the simd state machine.
+  if (mapping::utils::isSIMDMode(Mode)) {
+    uint32_t LaneId = mapping::getThreadIdInWarp();
+    if (LaneId) {
+      runSIMDStateMachine(Ident);
+      return LaneId;
+    }
   }
 
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
   if (IsSPMD) {
+    synchronize::threads();
     state::assumeInitialState(IsSPMD);
     return -1;
   }
@@ -98,7 +126,8 @@
 /// \param Ident Source location identification, can be NULL.
 ///
 void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) {
-  const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
+
   state::assumeInitialState(IsSPMD);
   if (IsSPMD)
     return;
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -178,7 +178,7 @@
 bool mapping::isLeaderInWarp() {
   __kmpc_impl_lanemask_t Active = mapping::activemask();
   __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
-  return utils::popc(Active & LaneMaskLT) == 0;
+  return ::_OMP::utils::popc(Active & LaneMaskLT) == 0;
 }
 
 LaneMaskTy mapping::activemask() { return impl::activemask(); }
@@ -191,6 +191,13 @@
 
 uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); }
 
+uint32_t mapping::getLogicThreadId() {
+  // In SIMD mode an OpenMP thread is a whole warp, so its id is the warp id;
+  // otherwise it is the id of the device thread within the block.
+  return mapping::isSIMDMode() ? mapping::getWarpId()
+                               : mapping::getThreadIdInBlock();
+}
+
 uint32_t mapping::getBlockSize() { return impl::getBlockSize(); }
 
 uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
@@ -214,16 +221,20 @@
 /// Execution mode
 ///
 ///{
-static int SHARED(IsSPMDMode);
+static int8_t SHARED(ExecutionMode);
 
-void mapping::init(bool IsSPMD) {
+void mapping::init(int8_t Mode) {
   if (!mapping::getThreadIdInBlock())
-    IsSPMDMode = IsSPMD;
+    ExecutionMode = Mode;
 }
 
-bool mapping::isSPMDMode() { return IsSPMDMode; }
+bool mapping::isSPMDMode() { return mapping::utils::isSPMDMode(ExecutionMode); }
 
-bool mapping::isGenericMode() { return !isSPMDMode(); }
+bool mapping::isGenericMode() {
+  return mapping::utils::isGenericMode(ExecutionMode);
+}
+
+bool mapping::isSIMDMode() { return mapping::utils::isSIMDMode(ExecutionMode); }
 ///}
 
 extern "C" {
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -49,20 +49,43 @@
 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
   uint32_t NThreadsICV =
       NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
-  uint32_t NumThreads = mapping::getBlockSize();
+
+  const bool IsSIMDMode = mapping::isSIMDMode();
+
+  uint32_t NumThreads =
+      IsSIMDMode ? mapping::getNumberOfWarpsInBlock() : mapping::getBlockSize();
 
   if (NThreadsICV != 0 && NThreadsICV < NumThreads)
     NumThreads = NThreadsICV;
 
   // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
-  if (NumThreads < mapping::getWarpSize())
-    NumThreads = 1;
-  else
-    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  // We don't need this for SIMD mode because an OpenMP thread is mapped to a
+  // warp on the device and it can be any number.
+  if (!IsSIMDMode) {
+    if (NumThreads < mapping::getWarpSize())
+      NumThreads = 1;
+    else
+      NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  }
 
   return NumThreads;
 }
 
+uint32_t determineSIMDLen(int32_t SIMDLen, int32_t SafeLen) {
+  ASSERT(mapping::isSIMDMode());
+  // An OpenMP thread is a warp in SIMD mode, so the warp size is both the
+  // default and the upper bound. Non-positive values (the compiler passes -1)
+  // mean "clause not specified" and must not be compared as real lengths.
+  const uint32_t WarpSize = mapping::getWarpSize();
+  uint32_t Len = SIMDLen > 0 ? static_cast<uint32_t>(SIMDLen) : WarpSize;
+
+  // TODO: This is probably not right if the schedule is different.
+  if (SafeLen > 0 && static_cast<uint32_t>(SafeLen) < Len)
+    Len = static_cast<uint32_t>(SafeLen);
+
+  return Len < WarpSize ? Len : WarpSize;
+}
+
 // Invoke an outlined parallel function unwrapping arguments (up to 32).
 void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                      void **args, int64_t nargs) {
@@ -78,11 +101,57 @@
 
 extern "C" {
 
+/// Runtime entry for a device 'simd' region. \p fn is the outlined region,
+/// \p args and \p nargs describe its captured variables, and \p wrapper_fn is
+/// published to the SIMD workers. \p ident and the second parameter are unused.
+void __kmpc_simd_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t safelen,
+                    int32_t simdlen, void *fn, void *wrapper_fn, void **args,
+                    int64_t nargs) {
+  // The parameters mirror the __kmpc_simd_51 entry in OMPKinds.def and the
+  // arguments the compiler emits for the call; neither has an 'order' slot.
+  // Handle the non-SIMD cases first, which are:
+  // - the if clause evaluated to false
+  // - simdlen or safelen is set to 1
+  // - we are already inside a simd region
+  const uint32_t LogicThreadId = mapping::getLogicThreadId();
+  if (OMP_UNLIKELY(!if_expr || simdlen == 1 || safelen == 1 ||
+                   icv::SIMDLevel)) {
+    invokeMicrotask(LogicThreadId, 0, fn, args, nargs);
+    return;
+  }
+
+  // Only the leader of each warp can execute the following code.
+  ASSERT(mapping::isLeaderInWarp());
+  const uint32_t SIMDLen = determineSIMDLen(simdlen, safelen);
+  if (LogicThreadId == 0)
+    state::SIMDLaneWidth = SIMDLen;
+  // Propagates the thread state to all SIMD workers from the leader.
+  state::propagateThreadState(SIMDLen);
+  // Synchronize all threads (leaders).
+  synchronize::threads();
+
+  {
+    state::ValueRAII SIMDRegionFnRAII(state::SIMDRegionFn, wrapper_fn,
+                                      (void *)nullptr, true);
+    state::ValueRAII SIMDLevelRAII(icv::SIMDLevel, 1u, 0u, true);
+    // Signal SIMD workers
+    synchronize::warp(mapping::activemask());
+
+    // TODO: Leader in warp also has to execute the SIMD region.
+    // What we need:
+    // - A work-sharing function that can take both thread id and lane id into
+    // consideration.
+
+    // Synchronize after execution of the SIMD region.
+    synchronize::warp(mapping::activemask());
+  }
+}
+
 void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                         int32_t num_threads, int proc_bind, void *fn,
                         void *wrapper_fn, void **args, int64_t nargs) {
 
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
   // Handle the serialized case first, same for SPMD/non-SPMD.
   if (OMP_UNLIKELY(!if_expr || icv::Level)) {
     __kmpc_serialized_parallel(ident, TId);
@@ -157,7 +226,7 @@
     return false;
 
   // Set to true for workers participating in the parallel region.
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
   bool ThreadIsActive = TId < state::ParallelTeamSize;
   return ThreadIsActive;
 }
@@ -171,6 +240,24 @@
   ASSERT(!mapping::isSPMDMode());
 }
 
+__attribute__((noinline)) bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn) {
+  // Publish the current SIMD work function to the caller.
+  *WorkFn = state::SIMDRegionFn;
+
+  // A null work function is the termination signal; the lane has nothing to
+  // run.
+  if (*WorkFn == nullptr)
+    return false;
+
+  // Lanes whose id is below the SIMD lane width participate in the region.
+  const uint32_t LaneId = mapping::getThreadIdInWarp();
+  return LaneId < state::SIMDLaneWidth;
+}
+
+__attribute__((noinline)) void __kmpc_kernel_end_simd() {
+  // TODO: Clean up after SIMD execution; intentionally empty for now.
+}
+
 void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
   state::enterDataEnvironment();
   ++icv::Level;
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -203,7 +203,7 @@
 
 struct TeamStateTy {
   /// TODO: provide a proper init function.
-  void init(bool IsSPMD);
+  void init(int Mode);
 
   bool operator==(const TeamStateTy &) const;
 
@@ -224,8 +224,13 @@
 
 TeamStateTy SHARED(TeamState);
 
-void TeamStateTy::init(bool IsSPMD) {
-  ICVState.NThreadsVar = mapping::getBlockSize();
+void TeamStateTy::init(int Mode) {
+  // In SIMD mode, we map an OpenMP thread to a warp.
+  if (mapping::utils::isSIMDMode(Mode))
+    ICVState.NThreadsVar = mapping::getNumberOfWarpsInBlock();
+  else
+    ICVState.NThreadsVar = mapping::getBlockSize();
+
   ICVState.LevelVar = 0;
   ICVState.ActiveLevelVar = 0;
   ICVState.MaxActiveLevelsVar = 1;
@@ -357,7 +362,8 @@
   __builtin_unreachable();
 }
 
-void state::init(bool IsSPMD) {
+void state::init(int8_t Mode) {
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
   SharedMemorySmartStack.init(IsSPMD);
   if (!mapping::getThreadIdInBlock())
-    TeamState.init(IsSPMD);
+    TeamState.init(Mode); // Needs the full mode to see the SIMD bit.
@@ -404,6 +410,15 @@
   ASSERT(mapping::isSPMDMode() == IsSPMD);
 }
 
+void state::propagateThreadState(unsigned SIMDLen) {
+  ASSERT(mapping::isSIMDMode());
+  ASSERT(mapping::isLeaderInWarp());
+  // Copy the caller's ThreadStates entry into the SIMDLen - 1 entries after it.
+  const uint32_t TId = mapping::getThreadIdInBlock();
+  for (unsigned I = 1; I < SIMDLen; ++I)
+    ThreadStates[TId + I] = ThreadStates[TId];
+}
+
 extern "C" {
 void omp_set_dynamic(int V) {}
 
@@ -434,7 +449,7 @@
 }
 
 int omp_get_ancestor_thread_num(int Level) {
-  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
+  return returnValIfLevelIsActive(Level, mapping::getLogicThreadId(), 0);
 }
 
 int omp_get_thread_num(void) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -214,8 +214,8 @@
 
 } // namespace impl
 
-void synchronize::init(bool IsSPMD) {
-  if (!IsSPMD)
+void synchronize::init(int8_t Mode) {
+  if (!mapping::utils::isSPMDMode(Mode) || mapping::utils::isSIMDMode(Mode))
     impl::namedBarrierInit();
 }
 
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -210,7 +210,7 @@
   static void dispatch_init(IdentTy *loc, int32_t threadId,
                             kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                             DynamicScheduleTracker *DST) {
-    int tid = mapping::getThreadIdInBlock();
+    int tid = mapping::getLogicThreadId();
     T tnum = omp_get_num_threads();
     T tripCount = ub - lb + 1; // +1 because ub is inclusive
     ASSERT0(LT_FUSSY, threadId < tnum,
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -1084,20 +1084,35 @@
     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
 
+    // GENERIC_SPMD is a combination of mode bits, so all of its bits have to
+    // be set; a plain `&` would also match pure generic and pure SPMD kernels.
     const bool IsSPMDGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+        (KernelInfo->ExecutionMode &
+         llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) ==
+        llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
     const bool IsSPMDMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
+        !IsSPMDGenericMode &&
+        (KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_SPMD);
     const bool IsGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
+        !IsSPMDGenericMode &&
+        (KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_GENERIC);
+    const bool IsSIMDMode =
+        KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_SIMD;
 
     int CudaThreadsPerBlock;
     if (ThreadLimit > 0) {
-      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
-      CudaThreadsPerBlock = ThreadLimit;
-      // Add master warp if necessary
-      if (IsGenericMode) {
-        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
-        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+      if (IsSIMDMode) {
+        DP("Setting CUDA threads per block to requested %d\n",
+           ThreadLimit * DeviceData[DeviceId].WarpSize);
+        CudaThreadsPerBlock = ThreadLimit * DeviceData[DeviceId].WarpSize;
+      } else {
+        DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+        CudaThreadsPerBlock = ThreadLimit;
+        // Add master warp if necessary
+        if (IsGenericMode) {
+          DP("Adding master warp: +%d threads\n",
+             DeviceData[DeviceId].WarpSize);
+          CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+        }
       }
     } else {
       DP("Setting CUDA threads per block to default %d\n",