diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2568,7 +2568,11 @@
    */
   CXCursor_OMPScanDirective = 287,
 
-  CXCursor_LastStmt = CXCursor_OMPScanDirective,
+  /** OpenMP canonical loop.
+   */
+  CXCursor_OMPCanonicalLoop = 288,
+
+  CXCursor_LastStmt = CXCursor_OMPCanonicalLoop,
 
   /**
    * Cursor that represents the translation unit itself.
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2787,6 +2787,14 @@
   return true;
 }
 
+DEF_TRAVERSE_STMT(OMPCanonicalLoop, {
+  if (!getDerived().shouldVisitImplicitCode()) {
+    // Visit only the syntactical loop.
+    TRY_TO(TraverseStmt(S->getLoopStmt()));
+    ShouldVisitChildren = false;
+  }
+})
+
 template <typename Derived>
 bool
 RecursiveASTVisitor<Derived>::TraverseOMPLoopDirective(OMPLoopDirective *S) {
diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -28,6 +28,209 @@
 // AST classes for directives.
 //===----------------------------------------------------------------------===//
 
+/// Representation of an OpenMP canonical loop.
+///
+/// OpenMP 1.0 C/C++, section 2.4.1 for Construct; canonical-shape
+/// OpenMP 2.0 C/C++, section 2.4.1 for Construct; canonical-shape
+/// OpenMP 2.5, section 2.5.1 Loop Construct; canonical form
+/// OpenMP 3.1, section 2.5.1 Loop Construct; canonical form
+/// OpenMP 4.0, section 2.6 Canonical Loop Form
+/// OpenMP 4.5, section 2.6 Canonical Loop Form
+/// OpenMP 5.0, section 2.9.1 Canonical Loop Form
+/// OpenMP 5.1, section 2.11.1 Canonical Loop Nest Form
+///
+/// An OpenMP canonical loop is a for-statement or range-based for-statement
+/// with additional requirements that ensure that the number of iterations is
+/// known before entering the loop and allow skipping to an arbitrary iteration.
+/// The OMPCanonicalLoop AST node wraps a ForStmt or CXXForRangeStmt that is
+/// known to fulfill OpenMP's canonical loop requirements.
+///
+/// There are three different kinds of iteration variables for different
+/// purposes:
+/// * Loop user variable: The user-accessible variable with different value for
+///   each iteration.
+/// * Loop iteration variable: The variable used to identify a loop iteration;
+///   for range-based for-statement, this is the hidden iterator '__begin'. For
+///   other loops, it is identical to the loop user variable. Must be a
+///   random-access iterator, pointer or integer type.
+/// * Logical iteration counter: Normalized loop counter starting at 0 and
+///   incrementing by one at each iteration. Allows abstracting over the type
+///   of the loop iteration variable and is always an unsigned integer type
+///   appropriate to represent the range of the loop iteration variable. Its
+///   value corresponds to the logical iteration number in the OpenMP
+///   specification.
+///
+/// This AST node provides two captured statements:
+/// * The distance function which computes the number of iterations.
+/// * The loop user variable function that computes the loop user variable when
+///   given a logical iteration number.
+///
+/// These captured statements provide the link between C/C++ semantics and the
+/// logical iteration counters used by the OpenMPIRBuilder which is
+/// language-agnostic and therefore does not know e.g. how to advance a
+/// random-access iterator. The OpenMPIRBuilder will use this information to
+/// apply simd, workshare-loop, distribute, taskloop and loop directives to the
+/// loop. For compatibility with the non-OpenMPIRBuilder codegen path, an
+/// OMPCanonicalLoop can itself also be wrapped into the CapturedStmts of an
+/// OMPLoopDirective and skipped when searching for the associated syntactical
+/// loop.
+///
+/// Example:
+/// <code>
+///   std::vector<std::string> Container{1,2,3};
+///   for (std::string Str : Container)
+///      Body(Str);
+/// </code>
+/// which is syntactic sugar for approximately:
+/// <code>
+///   auto &&__range = Container;
+///   auto __begin = std::begin(__range);
+///   auto __end = std::end(__range);
+///   for (; __begin != __end; ++__begin) {
+///     std::String Str = *__begin;
+///     Body(Str);
+///   }
+/// </code>
+/// In this example, the loop user variable is `Str`, the loop iteration
+/// variable is `__begin` of type `std::vector<std::string>::iterator` and the
+/// logical iteration number type is `size_t` (unsigned version of
+/// `std::vector<std::string>::iterator::difference_type` aka `ptrdiff_t`).
+/// Therefore, the distance function will be
+/// <code>
+///   [&](size_t &Result) { Result = __end - __begin; }
+/// </code>
+/// and the loop variable function is
+/// <code>
+///   [&,__begin](std::vector<std::string>::iterator &Result, size_t Logical) {
+///     Result = __begin + Logical;
+///   }
+/// </code>
+/// The variable `__begin`, aka the loop iteration variable, is captured by
+/// value because it is modified in the loop body, but both functions require
+/// the initial value. The OpenMP specification explicitly leaves unspecified
+/// when the loop expressions are evaluated such that a capture by reference is
+/// sufficient.
+class OMPCanonicalLoop : public Stmt {
+  friend class ASTStmtReader;
+  friend class ASTStmtWriter;
+
+  /// Children of this AST node.
+  enum {
+    LOOP_STMT,
+    DISTANCE_FUNC,
+    LOOPVAR_FUNC,
+    LOOPVAR_REF,
+    LastSubStmt = LOOPVAR_REF
+  };
+
+private:
+  /// This AST node's children.
+  Stmt *SubStmts[LastSubStmt + 1] = {};
+
+  OMPCanonicalLoop() : Stmt(StmtClass::OMPCanonicalLoopClass) {}
+
+public:
+  /// Create a new OMPCanonicalLoop.
+  static OMPCanonicalLoop *create(const ASTContext &Ctx, Stmt *LoopStmt,
+                                  CapturedStmt *DistanceFunc,
+                                  CapturedStmt *LoopVarFunc,
+                                  DeclRefExpr *LoopVarRef) {
+    OMPCanonicalLoop *S = new (Ctx) OMPCanonicalLoop();
+    S->setLoopStmt(LoopStmt);
+    S->setDistanceFunc(DistanceFunc);
+    S->setLoopVarFunc(LoopVarFunc);
+    S->setLoopVarRef(LoopVarRef);
+    return S;
+  }
+
+  /// Create an empty OMPCanonicalLoop for deserialization.
+  static OMPCanonicalLoop *createEmpty(const ASTContext &Ctx) {
+    return new (Ctx) OMPCanonicalLoop();
+  }
+
+  static bool classof(const Stmt *S) {
+    return S->getStmtClass() == StmtClass::OMPCanonicalLoopClass;
+  }
+
+  SourceLocation getBeginLoc() const { return getLoopStmt()->getBeginLoc(); }
+  SourceLocation getEndLoc() const { return getLoopStmt()->getEndLoc(); }
+
+  /// Return this AST node's children.
+  /// @{
+  child_range children() {
+    return child_range(&SubStmts[0], &SubStmts[0] + LastSubStmt + 1);
+  }
+  const_child_range children() const {
+    return const_child_range(&SubStmts[0], &SubStmts[0] + LastSubStmt + 1);
+  }
+  /// @}
+
+  /// The wrapped syntactic loop statement (ForStmt or CXXForRangeStmt).
+  /// @{
+  Stmt *getLoopStmt() { return SubStmts[LOOP_STMT]; }
+  const Stmt *getLoopStmt() const { return SubStmts[LOOP_STMT]; }
+  void setLoopStmt(Stmt *S) {
+    assert((isa<ForStmt>(S) || isa<CXXForRangeStmt>(S)) &&
+           "Canonical loop must be a for loop (range-based or otherwise)");
+    SubStmts[LOOP_STMT] = S;
+  }
+  /// @}
+
+  /// The function that computes the number of loop iterations. Can be evaluated
+  /// before entering the loop but after the syntactical loop's init
+  /// statement(s).
+  ///
+  /// Function signature: void(LogicalTy &Result)
+  /// Any values necessary to compute the distance are captures of the closure.
+  /// @{
+  CapturedStmt *getDistanceFunc() {
+    return cast<CapturedStmt>(SubStmts[DISTANCE_FUNC]);
+  }
+  const CapturedStmt *getDistanceFunc() const {
+    return cast<CapturedStmt>(SubStmts[DISTANCE_FUNC]);
+  }
+  void setDistanceFunc(CapturedStmt *S) {
+    assert(S && "Expected non-null captured statement");
+    SubStmts[DISTANCE_FUNC] = S;
+  }
+  /// @}
+
+  /// The function that computes the loop user variable from a logical iteration
+  /// counter. Can be evaluated as first statement in the loop.
+  ///
+  /// Function signature: void(LoopVarTy &Result, LogicalTy Number)
+  /// Any other values required to compute the loop user variable (such as start
+  /// value, step size) are captured by the closure. In particular, the initial
+  /// value of loop iteration variable is captured by value to be unaffected by
+  /// previous iterations.
+  /// @{
+  CapturedStmt *getLoopVarFunc() {
+    return cast<CapturedStmt>(SubStmts[LOOPVAR_FUNC]);
+  }
+  const CapturedStmt *getLoopVarFunc() const {
+    return cast<CapturedStmt>(SubStmts[LOOPVAR_FUNC]);
+  }
+  void setLoopVarFunc(CapturedStmt *S) {
+    assert(S && "Expected non-null captured statement");
+    SubStmts[LOOPVAR_FUNC] = S;
+  }
+  /// @}
+
+  /// Reference to the loop user variable as accessed in the loop body.
+  /// @{
+  DeclRefExpr *getLoopVarRef() {
+    return cast<DeclRefExpr>(SubStmts[LOOPVAR_REF]);
+  }
+  const DeclRefExpr *getLoopVarRef() const {
+    return cast<DeclRefExpr>(SubStmts[LOOPVAR_REF]);
+  }
+  void setLoopVarRef(DeclRefExpr *E) {
+    assert(E && "Expected non-null loop variable");
+    SubStmts[LOOPVAR_REF] = E;
+  }
+  /// @}
+};
+
 /// This is a basic class for representing single OpenMP executable
 /// directive.
 ///
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -216,6 +216,7 @@
 def AsTypeExpr : StmtNode<Expr>;
 
 // OpenMP Directives.
+def OMPCanonicalLoop : StmtNode<Stmt>;
 def OMPExecutableDirective : StmtNode<Stmt, 1>;
 def OMPLoopDirective : StmtNode<OMPExecutableDirective, 1>;
 def OMPParallelDirective : StmtNode<OMPExecutableDirective>;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -10482,6 +10482,11 @@
 
   /// Initialization of captured region for OpenMP region.
   void ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope);
+
+  /// Called for syntactical loops (ForStmt for CXXRangeForStmt) associated to
+  /// an OpenMP loop directive.
+  StmtResult ActOnOpenMPCanonicalLoop(Stmt *AStmt);
+
   /// End of OpenMP region.
   ///
   /// \param S Statement associated with the current OpenMP region.
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1830,21 +1830,21 @@
       /// A CXXBoolLiteralExpr record.
       EXPR_CXX_BOOL_LITERAL,
 
-      EXPR_CXX_NULL_PTR_LITERAL,  // CXXNullPtrLiteralExpr
-      EXPR_CXX_TYPEID_EXPR,       // CXXTypeidExpr (of expr).
-      EXPR_CXX_TYPEID_TYPE,       // CXXTypeidExpr (of type).
-      EXPR_CXX_THIS,              // CXXThisExpr
-      EXPR_CXX_THROW,             // CXXThrowExpr
-      EXPR_CXX_DEFAULT_ARG,       // CXXDefaultArgExpr
-      EXPR_CXX_DEFAULT_INIT,      // CXXDefaultInitExpr
-      EXPR_CXX_BIND_TEMPORARY,    // CXXBindTemporaryExpr
+      EXPR_CXX_NULL_PTR_LITERAL, // CXXNullPtrLiteralExpr
+      EXPR_CXX_TYPEID_EXPR,      // CXXTypeidExpr (of expr).
+      EXPR_CXX_TYPEID_TYPE,      // CXXTypeidExpr (of type).
+      EXPR_CXX_THIS,             // CXXThisExpr
+      EXPR_CXX_THROW,            // CXXThrowExpr
+      EXPR_CXX_DEFAULT_ARG,      // CXXDefaultArgExpr
+      EXPR_CXX_DEFAULT_INIT,     // CXXDefaultInitExpr
+      EXPR_CXX_BIND_TEMPORARY,   // CXXBindTemporaryExpr
 
       EXPR_CXX_SCALAR_VALUE_INIT, // CXXScalarValueInitExpr
       EXPR_CXX_NEW,               // CXXNewExpr
       EXPR_CXX_DELETE,            // CXXDeleteExpr
       EXPR_CXX_PSEUDO_DESTRUCTOR, // CXXPseudoDestructorExpr
 
-      EXPR_EXPR_WITH_CLEANUPS,    // ExprWithCleanups
+      EXPR_EXPR_WITH_CLEANUPS, // ExprWithCleanups
 
       EXPR_CXX_DEPENDENT_SCOPE_MEMBER,   // CXXDependentScopeMemberExpr
       EXPR_CXX_DEPENDENT_SCOPE_DECL_REF, // DependentScopeDeclRefExpr
@@ -1852,41 +1852,42 @@
       EXPR_CXX_UNRESOLVED_MEMBER,        // UnresolvedMemberExpr
       EXPR_CXX_UNRESOLVED_LOOKUP,        // UnresolvedLookupExpr
 
-      EXPR_CXX_EXPRESSION_TRAIT,  // ExpressionTraitExpr
-      EXPR_CXX_NOEXCEPT,          // CXXNoexceptExpr
+      EXPR_CXX_EXPRESSION_TRAIT, // ExpressionTraitExpr
+      EXPR_CXX_NOEXCEPT,         // CXXNoexceptExpr
 
-      EXPR_OPAQUE_VALUE,          // OpaqueValueExpr
-      EXPR_BINARY_CONDITIONAL_OPERATOR,  // BinaryConditionalOperator
-      EXPR_TYPE_TRAIT,            // TypeTraitExpr
-      EXPR_ARRAY_TYPE_TRAIT,      // ArrayTypeTraitIntExpr
+      EXPR_OPAQUE_VALUE,                // OpaqueValueExpr
+      EXPR_BINARY_CONDITIONAL_OPERATOR, // BinaryConditionalOperator
+      EXPR_TYPE_TRAIT,                  // TypeTraitExpr
+      EXPR_ARRAY_TYPE_TRAIT,            // ArrayTypeTraitIntExpr
 
-      EXPR_PACK_EXPANSION,        // PackExpansionExpr
-      EXPR_SIZEOF_PACK,           // SizeOfPackExpr
-      EXPR_SUBST_NON_TYPE_TEMPLATE_PARM, // SubstNonTypeTemplateParmExpr
-      EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK,// SubstNonTypeTemplateParmPackExpr
-      EXPR_FUNCTION_PARM_PACK,    // FunctionParmPackExpr
-      EXPR_MATERIALIZE_TEMPORARY, // MaterializeTemporaryExpr
-      EXPR_CXX_FOLD,              // CXXFoldExpr
-      EXPR_CONCEPT_SPECIALIZATION,// ConceptSpecializationExpr
-      EXPR_REQUIRES,              // RequiresExpr
+      EXPR_PACK_EXPANSION,                    // PackExpansionExpr
+      EXPR_SIZEOF_PACK,                       // SizeOfPackExpr
+      EXPR_SUBST_NON_TYPE_TEMPLATE_PARM,      // SubstNonTypeTemplateParmExpr
+      EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK, // SubstNonTypeTemplateParmPackExpr
+      EXPR_FUNCTION_PARM_PACK,                // FunctionParmPackExpr
+      EXPR_MATERIALIZE_TEMPORARY,             // MaterializeTemporaryExpr
+      EXPR_CXX_FOLD,                          // CXXFoldExpr
+      EXPR_CONCEPT_SPECIALIZATION,            // ConceptSpecializationExpr
+      EXPR_REQUIRES,                          // RequiresExpr
 
       // CUDA
-      EXPR_CUDA_KERNEL_CALL,       // CUDAKernelCallExpr
+      EXPR_CUDA_KERNEL_CALL, // CUDAKernelCallExpr
 
       // OpenCL
-      EXPR_ASTYPE,                 // AsTypeExpr
+      EXPR_ASTYPE, // AsTypeExpr
 
       // Microsoft
-      EXPR_CXX_PROPERTY_REF_EXPR, // MSPropertyRefExpr
+      EXPR_CXX_PROPERTY_REF_EXPR,       // MSPropertyRefExpr
       EXPR_CXX_PROPERTY_SUBSCRIPT_EXPR, // MSPropertySubscriptExpr
-      EXPR_CXX_UUIDOF_EXPR,       // CXXUuidofExpr (of expr).
-      EXPR_CXX_UUIDOF_TYPE,       // CXXUuidofExpr (of type).
-      STMT_SEH_LEAVE,             // SEHLeaveStmt
-      STMT_SEH_EXCEPT,            // SEHExceptStmt
-      STMT_SEH_FINALLY,           // SEHFinallyStmt
-      STMT_SEH_TRY,               // SEHTryStmt
+      EXPR_CXX_UUIDOF_EXPR,             // CXXUuidofExpr (of expr).
+      EXPR_CXX_UUIDOF_TYPE,             // CXXUuidofExpr (of type).
+      STMT_SEH_LEAVE,                   // SEHLeaveStmt
+      STMT_SEH_EXCEPT,                  // SEHExceptStmt
+      STMT_SEH_FINALLY,                 // SEHFinallyStmt
+      STMT_SEH_TRY,                     // SEHTryStmt
 
       // OpenMP directives
+      STMT_OMP_CANONICAL_LOOP,
       STMT_OMP_PARALLEL_DIRECTIVE,
       STMT_OMP_SIMD_DIRECTIVE,
       STMT_OMP_FOR_DIRECTIVE,
@@ -1946,10 +1947,10 @@
       EXPR_OMP_ITERATOR,
 
       // ARC
-      EXPR_OBJC_BRIDGED_CAST,     // ObjCBridgedCastExpr
+      EXPR_OBJC_BRIDGED_CAST, // ObjCBridgedCastExpr
 
-      STMT_MS_DEPENDENT_EXISTS,   // MSDependentExistsStmt
-      EXPR_LAMBDA,                // LambdaExpr
+      STMT_MS_DEPENDENT_EXISTS, // MSDependentExistsStmt
+      EXPR_LAMBDA,              // LambdaExpr
       STMT_COROUTINE_BODY,
       STMT_CORETURN,
       EXPR_COAWAIT,
diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp
--- a/clang/lib/AST/Stmt.cpp
+++ b/clang/lib/AST/Stmt.cpp
@@ -1266,13 +1266,6 @@
     break;
   case VCK_ByCopy:
     assert(Var && "capturing by copy must have a variable!");
-    assert(
-        (Var->getType()->isScalarType() || (Var->getType()->isReferenceType() &&
-                                            Var->getType()
-                                                ->castAs<ReferenceType>()
-                                                ->getPointeeType()
-                                                ->isScalarType())) &&
-        "captures by copy are expected to have a scalar type!");
     break;
   case VCK_VLAType:
     assert(!Var &&
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -91,6 +91,8 @@
         for (Stmt *S : CS->body()) {
           if (!S)
             continue;
+          if (auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(S))
+            S = CanonLoop->getLoopStmt();
           if (isa<ForStmt>(S) || isa<CXXForRangeStmt>(S)) {
             // Only single loop construct is allowed.
             if (CurStmt) {
@@ -121,6 +123,8 @@
 Stmt *OMPLoopDirective::getBody() {
   // This relies on the loop form is already checked by Sema.
   Stmt *Body = Data->getRawStmt()->IgnoreContainers();
+  if (auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(Body))
+    Body = CanonLoop->getLoopStmt();
   if (auto *For = dyn_cast<ForStmt>(Body)) {
     Body = For->getBody();
   } else {
@@ -130,6 +134,8 @@
   }
   for (unsigned Cnt = 1; Cnt < CollapsedNum; ++Cnt) {
     Body = tryToFindNextInnerLoop(Body, /*TryImperfectlyNestedLoops=*/true);
+    if (auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(Body))
+      Body = CanonLoop->getLoopStmt();
     if (auto *For = dyn_cast<ForStmt>(Body)) {
       Body = For->getBody();
     } else {
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -636,6 +636,10 @@
 //  OpenMP directives printing methods
 //===----------------------------------------------------------------------===//
 
+void StmtPrinter::VisitOMPCanonicalLoop(OMPCanonicalLoop *Node) {
+  PrintStmt(Node->getLoopStmt());
+}
+
 void StmtPrinter::PrintOMPExecutableDirective(OMPExecutableDirective *S,
                                               bool ForceNoStmt) {
   OMPClausePrinter Printer(OS, Policy);
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -848,6 +848,10 @@
       P.Visit(*I);
 }
 
+void StmtProfiler::VisitOMPCanonicalLoop(const OMPCanonicalLoop *L) {
+  VisitStmt(L);
+}
+
 void StmtProfiler::VisitOMPLoopDirective(const OMPLoopDirective *S) {
   VisitOMPExecutableDirective(S);
 }
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -194,6 +194,9 @@
   case Stmt::SEHTryStmtClass:
     EmitSEHTryStmt(cast<SEHTryStmt>(*S));
     break;
+  case Stmt::OMPCanonicalLoopClass:
+    EmitOMPCanonicalLoop(cast<OMPCanonicalLoop>(S));
+    break;
   case Stmt::OMPParallelDirectiveClass:
     EmitOMPParallelDirective(cast<OMPParallelDirective>(*S));
     break;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -163,7 +163,9 @@
     for (unsigned Cnt = 0; Cnt < S.getCollapsedNumber(); ++Cnt) {
       Body = OMPLoopDirective::tryToFindNextInnerLoop(
           Body, /*TryImperfectlyNestedLoops=*/true);
-      if (auto *For = dyn_cast<ForStmt>(Body)) {
+      if (const auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(Body))
+        Body = CanonLoop->getLoopStmt();
+      if (const auto *For = dyn_cast<ForStmt>(Body)) {
         Body = For->getBody();
       } else {
         assert(isa<CXXForRangeStmt>(Body) &&
@@ -1771,6 +1773,8 @@
     return;
   }
   if (SimplifiedS == NextLoop) {
+    if (const auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(SimplifiedS))
+      SimplifiedS = CanonLoop->getLoopStmt();
     if (const auto *For = dyn_cast<ForStmt>(SimplifiedS)) {
       S = For->getBody();
     } else {
@@ -1858,6 +1862,121 @@
   BreakContinueStack.pop_back();
 }
 
+using EmittedClosureTy = std::pair<llvm::Function *, llvm::Value *>;
+
+/// Emit a captured statement and return the function as well as its captured
+/// closure context.
+static EmittedClosureTy emitCapturedStmtFunc(CodeGenFunction &ParentCGF,
+                                             const CapturedStmt *S) {
+  LValue CapStruct = ParentCGF.InitCapturedStruct(*S);
+  CodeGenFunction CGF(ParentCGF.CGM, true);
+  std::unique_ptr<CodeGenFunction::CGCapturedStmtInfo> CSI =
+      std::make_unique<CodeGenFunction::CGCapturedStmtInfo>(*S);
+  CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, CSI.get());
+  llvm::Function *F = CGF.GenerateCapturedStmtFunction(*S);
+
+  return {F, CapStruct.getPointer(ParentCGF)};
+}
+
+/// Emit a call to a previously captured closure.
+static llvm::CallInst *
+emitCapturedStmtCall(CodeGenFunction &ParentCGF, EmittedClosureTy Cap,
+                     llvm::ArrayRef<llvm::Value *> Args) {
+  // Append the closure context to the argument.
+  SmallVector<llvm::Value *> EffectiveArgs;
+  EffectiveArgs.reserve(Args.size() + 1);
+  llvm::append_range(EffectiveArgs, Args);
+  EffectiveArgs.push_back(Cap.second);
+
+  return ParentCGF.Builder.CreateCall(Cap.first, EffectiveArgs);
+}
+
+llvm::CanonicalLoopInfo *
+CodeGenFunction::EmitOMPCollapsedCanonicalLoopNest(const Stmt *S, int Depth) {
+  assert(Depth == 1 && "Nested loops with OpenMPIRBuilder not yet implemented");
+
+  EmitStmt(S);
+  assert(OMPLoopNestStack.size() >= (size_t)Depth && "Found too few loops");
+
+  // The last added loop is the outermost one.
+  return OMPLoopNestStack.back();
+}
+
+void CodeGenFunction::EmitOMPCanonicalLoop(const OMPCanonicalLoop *S) {
+  const Stmt *SyntacticalLoop = S->getLoopStmt();
+  if (!getLangOpts().OpenMPIRBuilder) {
+    // Ignore if OpenMPIRBuilder is not enabled.
+    EmitStmt(SyntacticalLoop);
+    return;
+  }
+
+  LexicalScope ForScope(*this, S->getSourceRange());
+
+  // Emit init statements. The Distance/LoopVar funcs may reference variable
+  // declarations they contain.
+  const Stmt *BodyStmt;
+  if (const auto *For = dyn_cast<ForStmt>(SyntacticalLoop)) {
+    if (const Stmt *InitStmt = For->getInit())
+      EmitStmt(InitStmt);
+    BodyStmt = For->getBody();
+  } else if (const auto *RangeFor =
+                 dyn_cast<CXXForRangeStmt>(SyntacticalLoop)) {
+    if (const DeclStmt *RangeStmt = RangeFor->getRangeStmt())
+      EmitStmt(RangeStmt);
+    if (const DeclStmt *BeginStmt = RangeFor->getBeginStmt())
+      EmitStmt(BeginStmt);
+    if (const DeclStmt *EndStmt = RangeFor->getEndStmt())
+      EmitStmt(EndStmt);
+    if (const DeclStmt *LoopVarStmt = RangeFor->getLoopVarStmt())
+      EmitStmt(LoopVarStmt);
+    BodyStmt = RangeFor->getBody();
+  } else
+    llvm_unreachable("Expected for-stmt or range-based for-stmt");
+
+  // Emit closure for later use. By-value captures will be captured here.
+  const CapturedStmt *DistanceFunc = S->getDistanceFunc();
+  EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc);
+  const CapturedStmt *LoopVarFunc = S->getLoopVarFunc();
+  EmittedClosureTy LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc);
+
+  // Call the distance function to get the number of iterations of the loop to
+  // come.
+  QualType LogicalTy = DistanceFunc->getCapturedDecl()
+                           ->getParam(0)
+                           ->getType()
+                           .getNonReferenceType();
+  Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr");
+  emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()});
+  llvm::Value *DistVal = Builder.CreateLoad(CountAddr, ".count");
+
+  // Emit the loop structure.
+  llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder();
+  auto BodyGen = [&, this](llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+                           llvm::Value *IndVar) {
+    Builder.restoreIP(CodeGenIP);
+
+    // Emit the loop body: Convert the logical iteration number to the loop
+    // variable and emit the body.
+    const DeclRefExpr *LoopVarRef = S->getLoopVarRef();
+    LValue LCVal = EmitLValue(LoopVarRef);
+    Address LoopVarAddress = LCVal.getAddress(*this);
+    emitCapturedStmtCall(*this, LoopVarClosure,
+                         {LoopVarAddress.getPointer(), IndVar});
+
+    RunCleanupsScope BodyScope(*this);
+    EmitStmt(BodyStmt);
+  };
+  llvm::CanonicalLoopInfo *CL =
+      OMPBuilder.createCanonicalLoop(Builder, BodyGen, DistVal);
+
+  // Finish up the loop.
+  Builder.restoreIP(CL->getAfterIP());
+  ForScope.ForceCleanup();
+
+  // Remember the CanonicalLoopInfo for parent AST nodes consuming it.
+  OMPLoopNestStack.push_back(CL);
+}
+
 void CodeGenFunction::EmitOMPInnerLoop(
     const OMPExecutableDirective &S, bool RequiresCleanup, const Expr *LoopCond,
     const Expr *IncExpr,
@@ -1875,6 +1994,7 @@
   const CapturedStmt *ICS = OMPED.getInnermostCapturedStmt();
   const Stmt *SS = ICS->getCapturedStmt();
   const AttributedStmt *AS = dyn_cast_or_null<AttributedStmt>(SS);
+  OMPLoopNestStack.clear();
   if (AS)
     LoopStack.push(CondBlock, CGM.getContext(), CGM.getCodeGenOpts(),
                    AS->getAttrs(), SourceLocToDebugLoc(R.getBegin()),
@@ -2424,6 +2544,7 @@
   llvm::BasicBlock *CondBlock = createBasicBlock("omp.dispatch.cond");
   EmitBlock(CondBlock);
   const SourceRange R = S.getSourceRange();
+  OMPLoopNestStack.clear();
   LoopStack.push(CondBlock, SourceLocToDebugLoc(R.getBegin()),
                  SourceLocToDebugLoc(R.getEnd()));
 
@@ -2507,6 +2628,7 @@
   }
 
   EmitBranch(CondBlock);
+  OMPLoopNestStack.clear();
   LoopStack.pop();
   // Emit the fall-through block.
   EmitBlock(LoopExit.getBlock());
@@ -3349,10 +3471,38 @@
   return HasLastprivates;
 }
 
+static bool isSupportedByOpenMPIRBuilder(const OMPForDirective &S) {
+  if (S.hasCancel())
+    return false;
+  for (OMPClause *C : S.clauses())
+    if (!isa<OMPNowaitClause>(C))
+      return false;
+
+  return true;
+}
+
 void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
   bool HasLastprivates = false;
-  auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF,
-                                          PrePostActionTy &) {
+  bool UseOMPIRBuilder =
+      CGM.getLangOpts().OpenMPIRBuilder && isSupportedByOpenMPIRBuilder(S);
+  auto &&CodeGen = [this, &S, &HasLastprivates,
+                    UseOMPIRBuilder](CodeGenFunction &CGF, PrePostActionTy &) {
+    // Use the OpenMPIRBuilder if enabled.
+    if (UseOMPIRBuilder) {
+      // Emit the associated statement and get its loop representation.
+      const Stmt *Inner = S.getRawStmt();
+      llvm::CanonicalLoopInfo *CLI =
+          EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
+
+      bool NeedsBarrer = !S.getSingleClause<OMPNowaitClause>();
+      llvm::OpenMPIRBuilder &OMPBuilder =
+          CGM.getOpenMPRuntime().getOMPBuilder();
+      llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
+          AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
+      OMPBuilder.createWorkshareLoop(Builder, CLI, AllocaIP, NeedsBarrer);
+      return;
+    }
+
     HasLastprivates = emitWorksharingDirective(CGF, S, S.hasCancel());
   };
   {
@@ -3363,9 +3513,11 @@
                                                 S.hasCancel());
   }
 
-  // Emit an implicit barrier at the end.
-  if (!S.getSingleClause<OMPNowaitClause>() || HasLastprivates)
-    CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getBeginLoc(), OMPD_for);
+  if (!UseOMPIRBuilder) {
+    // Emit an implicit barrier at the end.
+    if (!S.getSingleClause<OMPNowaitClause>() || HasLastprivates)
+      CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getBeginLoc(), OMPD_for);
+  }
   // Check for outer lastprivate conditional update.
   checkForLastprivateConditionalUpdate(*this, S);
 }
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -50,6 +50,7 @@
 class SwitchInst;
 class Twine;
 class Value;
+class CanonicalLoopInfo;
 }
 
 namespace clang {
@@ -276,6 +277,20 @@
   // because of jumps.
   VarBypassDetector Bypasses;
 
+  /// List of recently emitted OMPCanonicalLoops.
+  ///
+  /// Since OMPCanonicalLoops are nested inside other statements (in particular
+  /// CapturedStmt generated by OMPExecutableDirective and non-perfectly nested
+  /// loops), we cannot directly call OMPEmitOMPCanonicalLoop and receive its
+  /// llvm::CanonicalLoopInfo. Instead, we call EmitStmt and any
+  /// OMPEmitOMPCanonicalLoop called by it will add its CanonicalLoopInfo to
+  /// this stack when done. Entering a new loop requires clearing this list; it
+  /// either means we start parsing an new loop nest (in which case the previous
+  /// loop nest goes out of scope) or a second loop in the same level in which
+  /// case it would be ambiguous into which of the two (or more) loops the loop
+  /// nest would extend.
+  SmallVector<llvm::CanonicalLoopInfo *, 4> OMPLoopNestStack;
+
   // CodeGen lambda for loops and support for ordered clause
   typedef llvm::function_ref<void(CodeGenFunction &, const OMPLoopDirective &,
                                   JumpDest)>
@@ -3499,6 +3514,18 @@
   static void EmitOMPTargetTeamsDistributeParallelForDeviceFunction(
       CodeGenModule &CGM, StringRef ParentName,
       const OMPTargetTeamsDistributeParallelForDirective &S);
+
+  /// Emit the Stmt \p S and return its topmost canonical loop, if any.
+  /// TODO: The \p Depth paramter is not yet implemented and must be 1. In the
+  /// future it is meant to be the number of loops expected in the loop nests
+  /// (usually specified by the "collapse" clause) that are collapsed to a
+  /// single loop by this function.
+  llvm::CanonicalLoopInfo *EmitOMPCollapsedCanonicalLoopNest(const Stmt *S,
+                                                             int Depth);
+
+  /// Emit an OMPCanonicalLoop using the OpenMPIRBuilder.
+  void EmitOMPCanonicalLoop(const OMPCanonicalLoop *S);
+
   /// Emit inner loop of the worksharing/simd construct.
   ///
   /// \param S Directive, for which the inner loop must be emitted.
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -91,8 +91,8 @@
   // seems to be a reasonable spot. We do it here, as opposed to the deletion
   // time of the CodeGenModule, because we have to ensure the IR has not yet
   // been "emitted" to the outside, thus, modifications are still sensible.
-  if (CGM.getLangOpts().OpenMPIRBuilder)
-    CGM.getOpenMPRuntime().getOMPBuilder().finalize();
+  if (CGM.getLangOpts().OpenMPIRBuilder && CurFn)
+    CGM.getOpenMPRuntime().getOMPBuilder().finalize(CurFn);
 }
 
 // Map the LangOption for exception behavior into
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2529,7 +2529,15 @@
       // the captured region. Code elsewhere assumes that any FunctionScopeInfo
       // should have at least one compound statement scope within it.
       ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false);
-      AssociatedStmt = (Sema::CompoundScopeRAII(Actions), ParseStatement());
+      {
+        Sema::CompoundScopeRAII Scope(Actions);
+        AssociatedStmt = ParseStatement();
+
+        if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) &&
+            getLangOpts().OpenMPIRBuilder)
+          AssociatedStmt =
+              Actions.ActOnOpenMPCanonicalLoop(AssociatedStmt.get());
+      }
       AssociatedStmt = Actions.ActOnOpenMPRegionEnd(AssociatedStmt, Clauses);
     } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data ||
                DKind == OMPD_target_exit_data) {
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1448,6 +1448,7 @@
   case Stmt::OMPMasterTaskLoopDirectiveClass:
   case Stmt::OMPMasterTaskLoopSimdDirectiveClass:
   case Stmt::OMPOrderedDirectiveClass:
+  case Stmt::OMPCanonicalLoopClass:
   case Stmt::OMPParallelDirectiveClass:
   case Stmt::OMPParallelForDirectiveClass:
   case Stmt::OMPParallelForSimdDirectiveClass:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -17263,18 +17263,17 @@
 
 
 /// Capture the given variable in the captured region.
-static bool captureInCapturedRegion(CapturedRegionScopeInfo *RSI,
-                                    VarDecl *Var,
-                                    SourceLocation Loc,
-                                    const bool BuildAndDiagnose,
-                                    QualType &CaptureType,
-                                    QualType &DeclRefType,
-                                    const bool RefersToCapturedVariable,
-                                    Sema &S, bool Invalid) {
+static bool captureInCapturedRegion(
+    CapturedRegionScopeInfo *RSI, VarDecl *Var, SourceLocation Loc,
+    const bool BuildAndDiagnose, QualType &CaptureType, QualType &DeclRefType,
+    const bool RefersToCapturedVariable, Sema::TryCaptureKind Kind,
+    bool IsTopScope, Sema &S, bool Invalid) {
   // By default, capture variables by reference.
   bool ByRef = true;
-  // Using an LValue reference type is consistent with Lambdas (see below).
-  if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) {
+  if (IsTopScope && Kind != Sema::TryCapture_Implicit) {
+    ByRef = (Kind == Sema::TryCapture_ExplicitByRef);
+  } else if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) {
+    // Using an LValue reference type is consistent with Lambdas (see below).
     if (S.isOpenMPCapturedDecl(Var)) {
       bool HasConst = DeclRefType.isConstQualified();
       DeclRefType = DeclRefType.getUnqualifiedType();
@@ -17620,9 +17619,9 @@
                                DeclRefType, Nested, *this, Invalid);
       Nested = true;
     } else if (CapturedRegionScopeInfo *RSI = dyn_cast<CapturedRegionScopeInfo>(CSI)) {
-      Invalid = !captureInCapturedRegion(RSI, Var, ExprLoc, BuildAndDiagnose,
-                                         CaptureType, DeclRefType, Nested,
-                                         *this, Invalid);
+      Invalid = !captureInCapturedRegion(
+          RSI, Var, ExprLoc, BuildAndDiagnose, CaptureType, DeclRefType, Nested,
+          Kind, /*IsTopScope*/ I == N - 1, *this, Invalid);
       Nested = true;
     } else {
       LambdaScopeInfo *LSI = cast<LambdaScopeInfo>(CSI);
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -3767,6 +3767,9 @@
       Visit(E->getBase());
     }
   }
+
+  void VisitOMPCanonicalLoop(OMPCanonicalLoop *S) { VisitStmt(S); }
+
   void VisitOMPExecutableDirective(OMPExecutableDirective *S) {
     for (OMPClause *C : S->clauses()) {
       // Skip analysis of arguments of implicitly defined firstprivate clause
@@ -5130,6 +5133,350 @@
   }
 }
 
+namespace {
+/// Rewrite statements and expressions for Sema \p Actions CurContext.
+/// Used to capture variable references if already parsed statements/expressions
+/// into a CapturedStatement.
+class CaptureVars : public TreeTransform<CaptureVars> {
+  using BaseTransform = TreeTransform<CaptureVars>;
+
+public:
+  CaptureVars(Sema &Actions) : BaseTransform(Actions) {}
+
+  bool AlwaysRebuild() { return true; }
+};
+} // namespace
+
+/// Create a closure that computes the number of iterations of a loop.
+///
+/// \param Actions   The Sema object.
+/// \param LogicalTy Type for the logical iteration number.
+/// \param Rel       Comparison operator of the loop condition.
+/// \param StartExpr Value of the loop counter at the first iteration.
+/// \param StopExpr  Expression the loop counter is compared against in the loop
+/// condition. \param Step      Amount of increment after each iteration.
+///
+/// \return Closure (CapturedStmt) of the distance calculation.
+static CapturedStmt *buildDistanceFunc(Sema &Actions, QualType LogicalTy,
+                                       BinaryOperator::Opcode Rel,
+                                       Expr *StartExpr, Expr *StopExpr,
+                                       Expr *Step) {
+  ASTContext &Ctx = Actions.getASTContext();
+  TypeSourceInfo *LogicalTSI = Ctx.getTrivialTypeSourceInfo(LogicalTy);
+
+  // Captured regions currently don't support return values, we use an
+  // out-parameter instead. All inputs are implicit captures.
+  // TODO: Instead of capturing each DeclRefExpr occurring in
+  // StartExpr/StopExpr/Step, these could also be passed as a value capture.
+  QualType ResultTy = Ctx.getLValueReferenceType(LogicalTy);
+  Sema::CapturedParamNameType Params[] = {{"Distance", ResultTy},
+                                          {StringRef(), QualType()}};
+  Actions.ActOnCapturedRegionStart({}, nullptr, CR_Default, Params);
+
+  Expr *Body;
+  {
+    Sema::CompoundScopeRAII CompoundScope(Actions);
+    CapturedDecl *CS = cast<CapturedDecl>(Actions.CurContext);
+
+    // Get the LValue expression for the result.
+    ImplicitParamDecl *DistParam = CS->getParam(0);
+    DeclRefExpr *DistRef = Actions.BuildDeclRefExpr(
+        DistParam, LogicalTy, VK_LValue, {}, nullptr, nullptr, {}, nullptr);
+
+    // Capture all referenced variable references.
+    CaptureVars Recap(Actions);
+    Expr *NewStart = AssertSuccess(Recap.TransformExpr(StartExpr));
+    Expr *NewStop = AssertSuccess(Recap.TransformExpr(StopExpr));
+    Expr *NewStep = AssertSuccess(Recap.TransformExpr(Step));
+
+    IntegerLiteral *Zero = IntegerLiteral::Create(
+        Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), 0), LogicalTy, {});
+    Expr *Dist;
+    if (Rel == BO_NE) {
+      // When using a != comparison, the increment can be +1 or -1. This can be
+      // dynamic at runtime, so we need to check for the direction.
+      Expr *IsNegStep =
+          AssertSuccess(Actions.BuildBinOp(nullptr, {}, BO_LT, NewStep, Zero));
+
+      // Positive increment.
+      Expr *ForwardRange = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStop, NewStart));
+      ForwardRange = AssertSuccess(
+          Actions.BuildCStyleCastExpr({}, LogicalTSI, {}, ForwardRange));
+      Expr *ForwardDist = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Div, ForwardRange, NewStep));
+
+      // Negative increment.
+      Expr *BackwardRange = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStart, NewStop));
+      BackwardRange = AssertSuccess(
+          Actions.BuildCStyleCastExpr({}, LogicalTSI, {}, BackwardRange));
+      Expr *NegIncAmount =
+          AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_Minus, NewStep));
+      Expr *BackwardDist = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Div, BackwardRange, NegIncAmount));
+
+      // Use the appropriate case.
+      Dist = AssertSuccess(Actions.ActOnConditionalOp(
+          {}, {}, IsNegStep, BackwardDist, ForwardDist));
+    } else {
+      assert((Rel == BO_LT || Rel == BO_LE || Rel == BO_GE || Rel == BO_GT) &&
+             "Expected one of these relational operators");
+
+      // We can derive the direction from any other comparison operator. It is
+      // non well-formed OpenMP if Step increments/decrements in the other
+      // directions. Whether at least the first iteration passes the loop
+      // condition.
+      Expr *HasAnyIteration = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, Rel, NewStart, NewStop));
+
+      // Compute the range between first and last counter value.
+      Expr *Range;
+      if (Rel == BO_GE || Rel == BO_GT)
+        Range = AssertSuccess(
+            Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStart, NewStop));
+      else
+        Range = AssertSuccess(
+            Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStop, NewStart));
+
+      // Ensure unsigned range space.
+      Range =
+          AssertSuccess(Actions.BuildCStyleCastExpr({}, LogicalTSI, {}, Range));
+
+      if (Rel == BO_LE || Rel == BO_GE) {
+        // Add one to the range if the relational operator is inclusive.
+        Range =
+            AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_PreInc, Range));
+      }
+
+      // Divide by the absolute step amount.
+      if (Rel == BO_GE || Rel == BO_GT)
+        NewStep =
+            AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_Minus, NewStep));
+      Dist = AssertSuccess(
+          Actions.BuildBinOp(nullptr, {}, BO_Div, Range, NewStep));
+
+      // If there is not at least one iteration, the range contains garbage. Fix
+      // to zero in this case.
+      Dist = AssertSuccess(
+          Actions.ActOnConditionalOp({}, {}, HasAnyIteration, Dist, Zero));
+    }
+
+    // Assign the result to the out-parameter.
+    Body = AssertSuccess(Actions.BuildBinOp(Actions.getCurScope(), {},
+                                            BO_Assign, DistRef, Dist));
+  }
+  return cast<CapturedStmt>(
+      AssertSuccess(Actions.ActOnCapturedRegionEnd(Body)));
+}
+
+/// Create a closure that computes the loop variable from the logical iteration
+/// number.
+///
+/// \param Actions   The Sema object.
+/// \param LoopVarTy Type for the loop variable used for result value.
+/// \param LogicalTy Type for the logical iteration number.
+/// \param StartExpr Value of the loop counter at the first iteration.
+/// \param Step      Amount of increment after each iteration.
+/// \param Deref     Whether the loop variable is a dereference of the loop
+/// counter variable.
+///
+/// \return Closure (CapturedStmt) of the loop value calculation.
+static CapturedStmt *buildLoopVarFunc(Sema &Actions, QualType LoopVarTy,
+                                      QualType LogicalTy,
+                                      DeclRefExpr *StartExpr, Expr *Step,
+                                      bool Deref) {
+  ASTContext &Ctx = Actions.getASTContext();
+
+  // Pass the result as an out-parameter. Passing as return value would require
+  // the OpenMPIRBuilder to know additional C/C++ semantics, such as how to
+  // invoke a copy constructor.
+  QualType TargetParamTy = Ctx.getLValueReferenceType(LoopVarTy);
+  Sema::CapturedParamNameType Params[] = {{"LoopVar", TargetParamTy},
+                                          {"Logical", LogicalTy},
+                                          {StringRef(), QualType()}};
+  Actions.ActOnCapturedRegionStart({}, nullptr, CR_Default, Params);
+
+  // Capture the initial iterator which represents the LoopVar value at the
+  // zero's logical iteration. Since the original ForStmt/CXXForRangeStmt update
+  // it in every iteration, capture it by value before it is modified.
+  VarDecl *StartVar = cast<VarDecl>(StartExpr->getDecl());
+  bool Invalid = Actions.tryCaptureVariable(StartVar, {},
+                                            Sema::TryCapture_ExplicitByVal, {});
+  (void)Invalid;
+  assert(!Invalid && "Expecting capture-by-value to work.");
+
+  Expr *Body;
+  {
+    Sema::CompoundScopeRAII CompoundScope(Actions);
+    auto *CS = cast<CapturedDecl>(Actions.CurContext);
+
+    ImplicitParamDecl *TargetParam = CS->getParam(0);
+    DeclRefExpr *TargetRef = Actions.BuildDeclRefExpr(
+        TargetParam, LoopVarTy, VK_LValue, {}, nullptr, nullptr, {}, nullptr);
+    ImplicitParamDecl *IndvarParam = CS->getParam(1);
+    DeclRefExpr *LogicalRef = Actions.BuildDeclRefExpr(
+        IndvarParam, LogicalTy, VK_LValue, {}, nullptr, nullptr, {}, nullptr);
+
+    // Capture the Start expression.
+    CaptureVars Recap(Actions);
+    Expr *NewStart = AssertSuccess(Recap.TransformExpr(StartExpr));
+    Expr *NewStep = AssertSuccess(Recap.TransformExpr(Step));
+
+    Expr *Skip = AssertSuccess(
+        Actions.BuildBinOp(nullptr, {}, BO_Mul, NewStep, LogicalRef));
+    // TODO: Explicitly cast to the iterator's difference_type instead of
+    // relying on implicit conversion.
+    Expr *Advanced =
+        AssertSuccess(Actions.BuildBinOp(nullptr, {}, BO_Add, NewStart, Skip));
+
+    if (Deref) {
+      // For range-based for-loops convert the loop counter value to a concrete
+      // loop variable value by dereferencing the iterator.
+      Advanced =
+          AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_Deref, Advanced));
+    }
+
+    // Assign the result to the output parameter.
+    Body = AssertSuccess(Actions.BuildBinOp(Actions.getCurScope(), {},
+                                            BO_Assign, TargetRef, Advanced));
+  }
+  return cast<CapturedStmt>(
+      AssertSuccess(Actions.ActOnCapturedRegionEnd(Body)));
+}
+
+StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) {
+  ASTContext &Ctx = getASTContext();
+
+  // Extract the common elements of ForStmt and CXXForRangeStmt:
+  // Loop variable, repeat condition, increment
+  Expr *Cond, *Inc;
+  VarDecl *CounterDecl, *LVDecl;
+  if (auto *For = dyn_cast<ForStmt>(AStmt)) {
+    Stmt *Init = For->getInit();
+    if (auto *LCVarDeclStmt = dyn_cast<DeclStmt>(Init)) {
+      // For statement declares loop variable.
+      CounterDecl = cast<VarDecl>(LCVarDeclStmt->getSingleDecl());
+    } else if (auto *LCAssign = dyn_cast<BinaryOperator>(Init)) {
+      // For statement reuses variable.
+      assert(LCAssign->getOpcode() == BO_Assign &&
+             "init part must be a loop variable assignment");
+      auto *CounterRef = cast<DeclRefExpr>(LCAssign->getLHS());
+      CounterDecl = cast<VarDecl>(CounterRef->getDecl());
+    } else
+      llvm_unreachable("Cannot determine loop variable");
+    LVDecl = CounterDecl;
+
+    Cond = For->getCond();
+    Inc = For->getInc();
+  } else if (auto *RangeFor = dyn_cast<CXXForRangeStmt>(AStmt)) {
+    DeclStmt *BeginStmt = RangeFor->getBeginStmt();
+    CounterDecl = cast<VarDecl>(BeginStmt->getSingleDecl());
+    LVDecl = RangeFor->getLoopVariable();
+
+    Cond = RangeFor->getCond();
+    Inc = RangeFor->getInc();
+  } else
+    llvm_unreachable("unhandled kind of loop");
+
+  QualType CounterTy = CounterDecl->getType();
+  QualType LVTy = LVDecl->getType();
+
+  // Analyze the loop condition.
+  Expr *LHS, *RHS;
+  BinaryOperator::Opcode CondRel;
+  Cond = Cond->IgnoreImplicit();
+  if (auto *CondBinExpr = dyn_cast<BinaryOperator>(Cond)) {
+    LHS = CondBinExpr->getLHS();
+    RHS = CondBinExpr->getRHS();
+    CondRel = CondBinExpr->getOpcode();
+  } else if (auto *CondCXXOp = dyn_cast<CXXOperatorCallExpr>(Cond)) {
+    assert(CondCXXOp->getOperator() == OO_ExclaimEqual &&
+           "Expected != loop condition for iterator-based loops");
+    assert(CondCXXOp->getNumArgs() == 2 && "Comparison should have 2 operands");
+    LHS = CondCXXOp->getArg(0);
+    RHS = CondCXXOp->getArg(1);
+    CondRel = BO_NE;
+  } else
+    llvm_unreachable("unexpected loop condition");
+
+  // Normalize such that the loop counter is on the LHS.
+  if (!isa<DeclRefExpr>(LHS->IgnoreImplicit()) ||
+      cast<DeclRefExpr>(LHS->IgnoreImplicit())->getDecl() != CounterDecl) {
+    std::swap(LHS, RHS);
+    CondRel = BinaryOperator::reverseComparisonOp(CondRel);
+  }
+  auto *CounterRef = cast<DeclRefExpr>(LHS->IgnoreImplicit());
+
+  // Decide the bit width for the logical iteration counter. By default use the
+  // unsigned ptrdiff_t integer size (for iterators and pointers).
+  // TODO: For iterators, use iterator::difference_type,
+  // std::iterator_traits<>::difference_type or decltype(it - end).
+  QualType LogicalTy = Ctx.getUnsignedPointerDiffType();
+  if (CounterTy->isIntegerType()) {
+    unsigned BitWidth = Ctx.getIntWidth(CounterTy);
+    LogicalTy = Ctx.getIntTypeForBitwidth(BitWidth, false);
+  }
+
+  // Analyze the loop increment.
+  Expr *Step;
+  if (auto *IncUn = dyn_cast<UnaryOperator>(Inc)) {
+    int Direction;
+    switch (IncUn->getOpcode()) {
+    case UO_PreInc:
+    case UO_PostInc:
+      Direction = 1;
+      break;
+    case UO_PreDec:
+    case UO_PostDec:
+      Direction = -1;
+      break;
+    default:
+      llvm_unreachable("unhandled unary increment operator");
+    }
+    Step = IntegerLiteral::Create(
+        Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), Direction), LogicalTy, {});
+  } else if (auto *IncBin = dyn_cast<BinaryOperator>(Inc)) {
+    if (IncBin->getOpcode() == BO_AddAssign) {
+      Step = IncBin->getRHS();
+    } else if (IncBin->getOpcode() == BO_SubAssign) {
+      Step =
+          AssertSuccess(BuildUnaryOp(nullptr, {}, UO_Minus, IncBin->getRHS()));
+    } else
+      llvm_unreachable("unhandled binary increment operator");
+  } else if (auto *CondCXXOp = dyn_cast<CXXOperatorCallExpr>(Inc)) {
+    switch (CondCXXOp->getOperator()) {
+    case OO_PlusPlus:
+      Step = IntegerLiteral::Create(
+          Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), 1), LogicalTy, {});
+      break;
+    case OO_MinusMinus:
+      Step = IntegerLiteral::Create(
+          Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), -1), LogicalTy, {});
+      break;
+    case OO_PlusEqual:
+      Step = CondCXXOp->getArg(1);
+      break;
+    case OO_MinusEqual:
+      Step = AssertSuccess(
+          BuildUnaryOp(nullptr, {}, UO_Minus, CondCXXOp->getArg(1)));
+      break;
+    default:
+      llvm_unreachable("unhandled overloaded increment operator");
+    }
+  } else
+    llvm_unreachable("unknown increment expression");
+
+  CapturedStmt *DistanceFunc =
+      buildDistanceFunc(*this, LogicalTy, CondRel, LHS, RHS, Step);
+  CapturedStmt *LoopVarFunc = buildLoopVarFunc(
+      *this, LVTy, LogicalTy, CounterRef, Step, isa<CXXForRangeStmt>(AStmt));
+  DeclRefExpr *LVRef = BuildDeclRefExpr(LVDecl, LVDecl->getType(), VK_LValue,
+                                        {}, nullptr, nullptr, {}, nullptr);
+  return OMPCanonicalLoop::create(getASTContext(), AStmt, DistanceFunc,
+                                  LoopVarFunc, LVRef);
+}
+
 StmtResult Sema::ActOnOpenMPExecutableDirective(
     OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName,
     OpenMPDirectiveKind CancelRegion, ArrayRef<OMPClause *> Clauses,
@@ -7947,6 +8294,8 @@
   // OpenMP [2.9.1, Canonical Loop Form]
   //   for (init-expr; test-expr; incr-expr) structured-block
   //   for (range-decl: range-expr) structured-block
+  if (auto *CanonLoop = dyn_cast_or_null<OMPCanonicalLoop>(S))
+    S = CanonLoop->getLoopStmt();
   auto *For = dyn_cast_or_null<ForStmt>(S);
   auto *CXXFor = dyn_cast_or_null<CXXForRangeStmt>(S);
   // Ranged for is supported only in OpenMP 5.0.
@@ -8280,6 +8629,17 @@
   return PostUpdate;
 }
 
+static Stmt *findLoopBody(Stmt *L) {
+  if (auto *CanonLoop = dyn_cast_or_null<OMPCanonicalLoop>(L))
+    L = CanonLoop->getLoopStmt();
+  if (auto *For = dyn_cast<ForStmt>(L))
+    return For->getBody();
+
+  assert(isa<CXXForRangeStmt>(L) &&
+         "Expected canonical for or range-based for loops.");
+  return cast<CXXForRangeStmt>(L)->getBody();
+}
+
 /// Called on a for stmt to check itself and nested loops (if any).
 /// \return Returns 0 if one of the collapsed stmts is not canonical for loop,
 /// number of collapsed loops otherwise.
@@ -8340,13 +8700,7 @@
     // All loops associated with the construct must be perfectly nested; that
     // is, there must be no intervening code nor any OpenMP directive between
     // any two loops.
-    if (auto *For = dyn_cast<ForStmt>(CurStmt)) {
-      CurStmt = For->getBody();
-    } else {
-      assert(isa<CXXForRangeStmt>(CurStmt) &&
-             "Expected canonical for or range-based for loops.");
-      CurStmt = cast<CXXForRangeStmt>(CurStmt)->getBody();
-    }
+    CurStmt = findLoopBody(CurStmt);
     CurStmt = OMPLoopDirective::tryToFindNextInnerLoop(
         CurStmt, SemaRef.LangOpts.OpenMP >= 50);
   }
@@ -8368,13 +8722,7 @@
     // All loops associated with the construct must be perfectly nested; that
     // is, there must be no intervening code nor any OpenMP directive between
     // any two loops.
-    if (auto *For = dyn_cast<ForStmt>(CurStmt)) {
-      CurStmt = For->getBody();
-    } else {
-      assert(isa<CXXForRangeStmt>(CurStmt) &&
-             "Expected canonical for or range-based for loops.");
-      CurStmt = cast<CXXForRangeStmt>(CurStmt)->getBody();
-    }
+    CurStmt = findLoopBody(CurStmt);
     CurStmt = OMPLoopDirective::tryToFindNextInnerLoop(
         CurStmt, SemaRef.LangOpts.OpenMP >= 50);
   }
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -8314,6 +8314,15 @@
 //===----------------------------------------------------------------------===//
 // OpenMP directive transformation
 //===----------------------------------------------------------------------===//
+
+template <typename Derived>
+StmtResult
+TreeTransform<Derived>::TransformOMPCanonicalLoop(OMPCanonicalLoop *L) {
+  // The OMPCanonicalLoop will be recreated when transforming the loop-associted
+  // directive.
+  return getDerived().TransformStmt(L->getLoopStmt());
+}
+
 template <typename Derived>
 StmtResult TreeTransform<Derived>::TransformOMPExecutableDirective(
     OMPExecutableDirective *D) {
@@ -8350,6 +8359,9 @@
       else
         CS = D->getInnermostCapturedStmt()->getCapturedStmt();
       Body = getDerived().TransformStmt(CS);
+      if (Body.isUsable() && isOpenMPLoopDirective(D->getDirectiveKind()) &&
+          getSema().getLangOpts().OpenMPIRBuilder)
+        Body = getSema().ActOnOpenMPCanonicalLoop(Body.get());
     }
     AssociatedStmt =
         getDerived().getSema().ActOnOpenMPRegionEnd(Body, TClauses);
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2273,6 +2273,12 @@
 // OpenMP Directives.
 //===----------------------------------------------------------------------===//
 
+void ASTStmtReader::VisitOMPCanonicalLoop(OMPCanonicalLoop *S) {
+  VisitStmt(S);
+  for (Stmt *&SubStmt : S->SubStmts)
+    SubStmt = Record.readSubStmt();
+}
+
 void ASTStmtReader::VisitOMPExecutableDirective(OMPExecutableDirective *E) {
   Record.readOMPChildren(E->Data);
   E->setLocStart(readSourceLocation());
@@ -3130,6 +3136,10 @@
                                               nullptr);
       break;
 
+    case STMT_OMP_CANONICAL_LOOP:
+      S = OMPCanonicalLoop::createEmpty(Context);
+      break;
+
     case STMT_OMP_PARALLEL_DIRECTIVE:
       S =
         OMPParallelDirective::CreateEmpty(Context,
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2170,6 +2170,13 @@
 // OpenMP Directives.
 //===----------------------------------------------------------------------===//
 
+void ASTStmtWriter::VisitOMPCanonicalLoop(OMPCanonicalLoop *S) {
+  VisitStmt(S);
+  for (Stmt *SubStmt : S->SubStmts)
+    Record.AddStmt(SubStmt);
+  Code = serialization::STMT_OMP_CANONICAL_LOOP;
+}
+
 void ASTStmtWriter::VisitOMPExecutableDirective(OMPExecutableDirective *E) {
   Record.writeOMPChildren(E->Data);
   Record.AddSourceLocation(E->getBeginLoc());
diff --git a/clang/test/OpenMP/irbuilder_for_iterator.cpp b/clang/test/OpenMP/irbuilder_for_iterator.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/OpenMP/irbuilder_for_iterator.cpp
@@ -0,0 +1,147 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct MyIterator {
+  MyIterator(unsigned pos);
+  MyIterator(const MyIterator &other);
+  const MyIterator &operator=(const MyIterator &that);
+  MyIterator &operator++();
+  int operator-(const MyIterator &that) const;
+  MyIterator &operator+=(unsigned a);
+  MyIterator operator+(unsigned a) const;
+  bool operator==(const MyIterator &that) const;
+  bool operator!=(const MyIterator &that) const;
+  unsigned operator*() const;
+};
+
+extern "C" void workshareloop_iterator(float *a, float *b, float *c) {
+#pragma omp for
+  for (MyIterator it = MyIterator(7); it != MyIterator(41); ++it) {
+    unsigned i = *it;
+    a[i] = b[i] * c[i];
+  }
+}
+
+#endif // HEADER
+// CHECK-LABEL: define {{[^@]+}}@workshareloop_iterator
+// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]]) [[ATTR0:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[IT:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// CHECK-NEXT:    [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 1
+// CHECK-NEXT:    [[DOTCOUNT_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[P_STRIDE:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK-NEXT:    call void @_ZN10MyIteratorC1Ej(%struct.MyIterator* nonnull dereferenceable(1) [[IT]], i32 7)
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT:    store %struct.MyIterator* [[IT]], %struct.MyIterator** [[TMP0]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT:    call void @_ZN10MyIteratorC1ERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP1]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[IT]])
+// CHECK-NEXT:    call void @__captured_stmt(i64* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]])
+// CHECK-NEXT:    [[DOTCOUNT:%.*]] = load i64, i64* [[DOTCOUNT_ADDR]], align 8
+// CHECK-NEXT:    br label [[OMP_LOOP_PREHEADER:%.*]]
+// CHECK:       omp_loop.preheader:
+// CHECK-NEXT:    store i64 0, i64* [[P_LOWERBOUND]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[DOTCOUNT]], 1
+// CHECK-NEXT:    store i64 [[TMP2]], i64* [[P_UPPERBOUND]], align 8
+// CHECK-NEXT:    store i64 1, i64* [[P_STRIDE]], align 8
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]])
+// CHECK-NEXT:    call void @__kmpc_for_static_init_8u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1)
+// CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], [[TMP3]]
+// CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER:%.*]]
+// CHECK:       omp_loop.header:
+// CHECK-NEXT:    [[OMP_LOOP_IV:%.*]] = phi i64 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ]
+// CHECK-NEXT:    br label [[OMP_LOOP_COND:%.*]]
+// CHECK:       omp_loop.cond:
+// CHECK-NEXT:    [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP6]]
+// CHECK-NEXT:    br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp_loop.body:
+// CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP3]]
+// CHECK-NEXT:    call void @__captured_stmt.1(%struct.MyIterator* [[IT]], i64 [[TMP7]], %struct.anon.0* [[AGG_CAPTURED1]])
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratordeEv(%struct.MyIterator* nonnull dereferenceable(1) [[IT]])
+// CHECK-NEXT:    store i32 [[CALL]], i32* [[I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM2:%.*]] = zext i32 [[TMP12]] to i64
+// CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM2]]
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP10]], [[TMP13]]
+// CHECK-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM4:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM4]]
+// CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX5]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_INC]]
+// CHECK:       omp_loop.inc:
+// CHECK-NEXT:    [[OMP_LOOP_NEXT]] = add nuw i64 [[OMP_LOOP_IV]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER]]
+// CHECK:       omp_loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+// CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM6]])
+// CHECK-NEXT:    br label [[OMP_LOOP_AFTER:%.*]]
+// CHECK:       omp_loop.after:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK-SAME: (i64* nonnull align 8 dereferenceable(8) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) [[ATTR2:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DISTANCE_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    store i64* [[DISTANCE]], i64** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    call void @_ZN10MyIteratorC1Ej(%struct.MyIterator* nonnull dereferenceable(1) [[REF_TMP]], i32 41)
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[TMP1]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratormiERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[REF_TMP]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[TMP2]])
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[CALL]] to i64
+// CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[CONV]], 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load i64*, i64** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[DIV]], i64* [[TMP3]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK-SAME: (%struct.MyIterator* nonnull align 1 dereferenceable(1) [[LOOPVAR:%.*]], i64 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) [[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LOOPVAR_ADDR:%.*]] = alloca %struct.MyIterator*, align 8
+// CHECK-NEXT:    [[LOGICAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    store %struct.MyIterator* [[LOOPVAR]], %struct.MyIterator** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[LOGICAL]], i64* [[LOGICAL_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[LOGICAL_ADDR]], align 8
+// CHECK-NEXT:    [[MUL:%.*]] = mul i64 1, [[TMP2]]
+// CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[MUL]] to i32
+// CHECK-NEXT:    call void @_ZNK10MyIteratorplEj(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[REF_TMP]], %struct.MyIterator* nonnull dereferenceable(1) [[TMP1]], i32 [[CONV]])
+// CHECK-NEXT:    [[TMP3:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call nonnull align 1 dereferenceable(1) %struct.MyIterator* @_ZN10MyIteratoraSERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP3]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[REF_TMP]])
+// CHECK-NEXT:    ret void
+//
diff --git a/clang/test/OpenMP/irbuilder_for_rangefor.cpp b/clang/test/OpenMP/irbuilder_for_rangefor.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/OpenMP/irbuilder_for_rangefor.cpp
@@ -0,0 +1,164 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct MyIterator {
+  MyIterator(unsigned pos);
+  MyIterator(const MyIterator &other);
+  const MyIterator &operator=(const MyIterator &that);
+  MyIterator &operator++();
+  int operator-(const MyIterator &that) const;
+  MyIterator &operator+=(unsigned a);
+  MyIterator operator+(unsigned a) const;
+  bool operator==(const MyIterator &that) const;
+  bool operator!=(const MyIterator &that) const;
+  unsigned operator*() const;
+};
+
+struct MyRange {
+  MyRange(int n);
+
+  MyIterator begin();
+  MyIterator end();
+};
+
+extern "C" void workshareloop_rangefor(float *a, float *b, float *c) {
+#pragma omp for
+  for (unsigned i : MyRange(42)) {
+    a[i] = b[i] * c[i];
+  }
+}
+
+#endif // HEADER
+// CHECK-LABEL: define {{[^@]+}}@workshareloop_rangefor
+// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]]) [[ATTR0:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[__RANGE2:%.*]] = alloca %struct.MyRange*, align 8
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_MYRANGE:%.*]], align 1
+// CHECK-NEXT:    [[__BEGIN2:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    [[__END2:%.*]] = alloca [[STRUCT_MYITERATOR]], align 1
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// CHECK-NEXT:    [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 1
+// CHECK-NEXT:    [[DOTCOUNT_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[P_STRIDE:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK-NEXT:    call void @_ZN7MyRangeC1Ei(%struct.MyRange* nonnull dereferenceable(1) [[REF_TMP]], i32 42)
+// CHECK-NEXT:    store %struct.MyRange* [[REF_TMP]], %struct.MyRange** [[__RANGE2]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.MyRange*, %struct.MyRange** [[__RANGE2]], align 8
+// CHECK-NEXT:    call void @_ZN7MyRange5beginEv(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[__BEGIN2]], %struct.MyRange* nonnull dereferenceable(1) [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = load %struct.MyRange*, %struct.MyRange** [[__RANGE2]], align 8
+// CHECK-NEXT:    call void @_ZN7MyRange3endEv(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[__END2]], %struct.MyRange* nonnull dereferenceable(1) [[TMP1]])
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratordeEv(%struct.MyIterator* nonnull dereferenceable(1) [[__BEGIN2]])
+// CHECK-NEXT:    store i32 [[CALL]], i32* [[I]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT:    store %struct.MyIterator* [[__BEGIN2]], %struct.MyIterator** [[TMP2]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 1
+// CHECK-NEXT:    store %struct.MyIterator* [[__END2]], %struct.MyIterator** [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT:    call void @_ZN10MyIteratorC1ERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP4]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[__BEGIN2]])
+// CHECK-NEXT:    call void @__captured_stmt(i64* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]])
+// CHECK-NEXT:    [[DOTCOUNT:%.*]] = load i64, i64* [[DOTCOUNT_ADDR]], align 8
+// CHECK-NEXT:    br label [[OMP_LOOP_PREHEADER:%.*]]
+// CHECK:       omp_loop.preheader:
+// CHECK-NEXT:    store i64 0, i64* [[P_LOWERBOUND]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[DOTCOUNT]], 1
+// CHECK-NEXT:    store i64 [[TMP5]], i64* [[P_UPPERBOUND]], align 8
+// CHECK-NEXT:    store i64 1, i64* [[P_STRIDE]], align 8
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]])
+// CHECK-NEXT:    call void @__kmpc_for_static_init_8u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1)
+// CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], [[TMP6]]
+// CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER:%.*]]
+// CHECK:       omp_loop.header:
+// CHECK-NEXT:    [[OMP_LOOP_IV:%.*]] = phi i64 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ]
+// CHECK-NEXT:    br label [[OMP_LOOP_COND:%.*]]
+// CHECK:       omp_loop.cond:
+// CHECK-NEXT:    [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP9]]
+// CHECK-NEXT:    br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp_loop.body:
+// CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP6]]
+// CHECK-NEXT:    call void @__captured_stmt.1(i32* [[I]], i64 [[TMP10]], %struct.anon.0* [[AGG_CAPTURED1]])
+// CHECK-NEXT:    [[TMP11:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[TMP12]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP14:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM2:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM2]]
+// CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP13]], [[TMP16]]
+// CHECK-NEXT:    [[TMP17:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM4:%.*]] = zext i32 [[TMP18]] to i64
+// CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP17]], i64 [[IDXPROM4]]
+// CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX5]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_INC]]
+// CHECK:       omp_loop.inc:
+// CHECK-NEXT:    [[OMP_LOOP_NEXT]] = add nuw i64 [[OMP_LOOP_IV]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER]]
+// CHECK:       omp_loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+// CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM6]])
+// CHECK-NEXT:    br label [[OMP_LOOP_AFTER:%.*]]
+// CHECK:       omp_loop.after:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK-SAME: (i64* nonnull align 8 dereferenceable(8) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) [[ATTR2:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DISTANCE_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8
+// CHECK-NEXT:    store i64* [[DISTANCE]], i64** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[TMP3]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratormiERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP2]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[TMP4]])
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[CALL]] to i64
+// CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[CONV]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load i64*, i64** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[DIV]], i64* [[TMP5]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i64 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) [[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LOOPVAR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[LOGICAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1
+// CHECK-NEXT:    store i32* [[LOOPVAR]], i32** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[LOGICAL]], i64* [[LOGICAL_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[LOGICAL_ADDR]], align 8
+// CHECK-NEXT:    [[MUL:%.*]] = mul i64 1, [[TMP2]]
+// CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[MUL]] to i32
+// CHECK-NEXT:    call void @_ZNK10MyIteratorplEj(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[REF_TMP]], %struct.MyIterator* nonnull dereferenceable(1) [[TMP1]], i32 [[CONV]])
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK10MyIteratordeEv(%struct.MyIterator* nonnull dereferenceable(1) [[REF_TMP]])
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[CALL]], i32* [[TMP3]], align 4
+// CHECK-NEXT:    ret void
+//
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned.c b/clang/test/OpenMP/irbuilder_for_unsigned.c
new file mode 100644
--- /dev/null
+++ b/clang/test/OpenMP/irbuilder_for_unsigned.c
@@ -0,0 +1,147 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+extern "C" void workshareloop_unsigned(float *a, float *b, float *c, float *d) {
+#pragma omp for
+  for (unsigned i = 33; i < 32000000; i += 7) {
+    a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+#endif // HEADER
+// CHECK-LABEL: define {{[^@]+}}@workshareloop_unsigned
+// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]], float* [[D:%.*]]) [[ATTR0:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// CHECK-NEXT:    [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 4
+// CHECK-NEXT:    [[DOTCOUNT_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK-NEXT:    store i32 33, i32* [[I]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT:    store i32* [[I]], i32** [[TMP0]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP1]], align 4
+// CHECK-NEXT:    call void @__captured_stmt(i32* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]])
+// CHECK-NEXT:    [[DOTCOUNT:%.*]] = load i32, i32* [[DOTCOUNT_ADDR]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_PREHEADER:%.*]]
+// CHECK:       omp_loop.preheader:
+// CHECK-NEXT:    store i32 0, i32* [[P_LOWERBOUND]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[DOTCOUNT]], 1
+// CHECK-NEXT:    store i32 [[TMP3]], i32* [[P_UPPERBOUND]], align 4
+// CHECK-NEXT:    store i32 1, i32* [[P_STRIDE]], align 4
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]])
+// CHECK-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER:%.*]]
+// CHECK:       omp_loop.header:
+// CHECK-NEXT:    [[OMP_LOOP_IV:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ]
+// CHECK-NEXT:    br label [[OMP_LOOP_COND:%.*]]
+// CHECK:       omp_loop.cond:
+// CHECK-NEXT:    [[OMP_LOOP_CMP:%.*]] = icmp ult i32 [[OMP_LOOP_IV]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp_loop.body:
+// CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP4]]
+// CHECK-NEXT:    call void @__captured_stmt.1(i32* [[I]], i32 [[TMP8]], %struct.anon.0* [[AGG_CAPTURED1]])
+// CHECK-NEXT:    [[TMP9:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM2:%.*]] = zext i32 [[TMP13]] to i64
+// CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP12]], i64 [[IDXPROM2]]
+// CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP11]], [[TMP14]]
+// CHECK-NEXT:    [[TMP15:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM4:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 [[IDXPROM4]]
+// CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX5]], align 4
+// CHECK-NEXT:    [[MUL6:%.*]] = fmul float [[MUL]], [[TMP17]]
+// CHECK-NEXT:    [[TMP18:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM7:%.*]] = zext i32 [[TMP19]] to i64
+// CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[TMP18]], i64 [[IDXPROM7]]
+// CHECK-NEXT:    store float [[MUL6]], float* [[ARRAYIDX8]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_INC]]
+// CHECK:       omp_loop.inc:
+// CHECK-NEXT:    [[OMP_LOOP_NEXT]] = add nuw i32 [[OMP_LOOP_IV]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER]]
+// CHECK:       omp_loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+// CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM9]])
+// CHECK-NEXT:    br label [[OMP_LOOP_AFTER:%.*]]
+// CHECK:       omp_loop.after:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) [[ATTR1:#.*]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DISTANCE_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8
+// CHECK-NEXT:    store i32* [[DISTANCE]], i32** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP3]], 32000000
+// CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK:       cond.true:
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-NEXT:    [[SUB:%.*]] = sub i32 32000000, [[TMP6]]
+// CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[SUB]], 7
+// CHECK-NEXT:    br label [[COND_END:%.*]]
+// CHECK:       cond.false:
+// CHECK-NEXT:    br label [[COND_END]]
+// CHECK:       cond.end:
+// CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DISTANCE_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[COND]], i32* [[TMP7]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i32 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) [[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LOOPVAR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[LOGICAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT:    store i32* [[LOOPVAR]], i32** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[LOGICAL]], i32* [[LOGICAL_ADDR]], align 4
+// CHECK-NEXT:    store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[LOGICAL_ADDR]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = mul i32 7, [[TMP3]]
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
diff --git a/clang/test/OpenMP/irbuilder_nested_parallel_for.c b/clang/test/OpenMP/irbuilder_nested_parallel_for.c
--- a/clang/test/OpenMP/irbuilder_nested_parallel_for.c
+++ b/clang/test/OpenMP/irbuilder_nested_parallel_for.c
@@ -23,15 +23,15 @@
 //
 // CHECK-DEBUG-LABEL: @_Z14parallel_for_0v(
 // CHECK-DEBUG-NEXT:  entry:
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]]), [[DBG10:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]]), [[DBG12:!dbg !.*]]
 // CHECK-DEBUG-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK-DEBUG:       omp_parallel:
-// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @_Z14parallel_for_0v..omp_par to void (i32*, i32*, ...)*)), [[DBG11:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @_Z14parallel_for_0v..omp_par to void (i32*, i32*, ...)*)), [[DBG13:!dbg !.*]]
 // CHECK-DEBUG-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
 // CHECK-DEBUG:       omp.par.outlined.exit:
 // CHECK-DEBUG-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 // CHECK-DEBUG:       omp.par.exit.split:
-// CHECK-DEBUG-NEXT:    ret void, [[DBG14:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    ret void, [[DBG17:!dbg !.*]]
 //
 void parallel_for_0(void) {
 #pragma omp parallel
@@ -53,9 +53,9 @@
 // CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 // CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK:       omp_parallel:
-// CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.1 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]])
-// CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT19:%.*]]
-// CHECK:       omp.par.outlined.exit19:
+// CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]])
+// CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT16:%.*]]
+// CHECK:       omp.par.outlined.exit16:
 // CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 // CHECK:       omp.par.exit.split:
 // CHECK-NEXT:    ret void
@@ -66,20 +66,20 @@
 // CHECK-DEBUG-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
 // CHECK-DEBUG-NEXT:    store float* [[R:%.*]], float** [[R_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata float** [[R_ADDR]], [[META41:metadata !.*]], metadata !DIExpression()), [[DBG42:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata float** [[R_ADDR]], [[META67:metadata !.*]], metadata !DIExpression()), [[DBG68:!dbg !.*]]
 // CHECK-DEBUG-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], [[META43:metadata !.*]], metadata !DIExpression()), [[DBG44:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], [[META69:metadata !.*]], metadata !DIExpression()), [[DBG70:!dbg !.*]]
 // CHECK-DEBUG-NEXT:    store double [[B:%.*]], double* [[B_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata double* [[B_ADDR]], [[META45:metadata !.*]], metadata !DIExpression()), [[DBG46:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB10:@.*]]), [[DBG47:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata double* [[B_ADDR]], [[META71:metadata !.*]], metadata !DIExpression()), [[DBG72:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB6:@.*]]), [[DBG73:!dbg !.*]]
 // CHECK-DEBUG-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK-DEBUG:       omp_parallel:
-// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB10]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.1 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), [[DBG48:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT19:%.*]]
-// CHECK-DEBUG:       omp.par.outlined.exit19:
+// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB6]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), [[DBG74:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT16:%.*]]
+// CHECK-DEBUG:       omp.par.outlined.exit16:
 // CHECK-DEBUG-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 // CHECK-DEBUG:       omp.par.exit.split:
-// CHECK-DEBUG-NEXT:    ret void, [[DBG50:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    ret void, [[DBG76:!dbg !.*]]
 //
 void parallel_for_1(float *r, int a, double b) {
 #pragma omp parallel
@@ -99,76 +99,72 @@
 // CHECK-NEXT:    [[R_ADDR:%.*]] = alloca float*, align 8
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
-// CHECK-NEXT:    [[DOTOMP_IV212:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[TMP213:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[DOTOMP_LB214:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[DOTOMP_UB215:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[DOTOMP_STRIDE216:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[DOTOMP_IS_LAST217:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[I218:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[I185:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AGG_CAPTURED186:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8
+// CHECK-NEXT:    [[AGG_CAPTURED187:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4
+// CHECK-NEXT:    [[DOTCOUNT_ADDR188:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LASTITER203:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_LOWERBOUND204:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_UPPERBOUND205:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[P_STRIDE206:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store float* [[R:%.*]], float** [[R_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
 // CHECK-NEXT:    store double [[B:%.*]], double* [[B_ADDR]], align 8
 // CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 // CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK:       omp_parallel:
-// CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]])
-// CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT211:%.*]]
-// CHECK:       omp.par.outlined.exit211:
+// CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]])
+// CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT184:%.*]]
+// CHECK:       omp.par.outlined.exit184:
 // CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 // CHECK:       omp.par.exit.split:
-// CHECK-NEXT:    store i32 0, i32* [[DOTOMP_LB214]], align 4
-// CHECK-NEXT:    store i32 99, i32* [[DOTOMP_UB215]], align 4
-// CHECK-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE216]], align 4
-// CHECK-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST217]], align 4
-// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM219:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB41:@.*]])
-// CHECK-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM219]], i32 34, i32* [[DOTOMP_IS_LAST217]], i32* [[DOTOMP_LB214]], i32* [[DOTOMP_UB215]], i32* [[DOTOMP_STRIDE216]], i32 1, i32 1)
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4
-// CHECK-NEXT:    [[CMP220:%.*]] = icmp sgt i32 [[TMP0]], 99
-// CHECK-NEXT:    br i1 [[CMP220]], label [[COND_TRUE221:%.*]], label [[COND_FALSE222:%.*]]
-// CHECK:       cond.true221:
-// CHECK-NEXT:    br label [[COND_END223:%.*]]
-// CHECK:       cond.false222:
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4
-// CHECK-NEXT:    br label [[COND_END223]]
-// CHECK:       cond.end223:
-// CHECK-NEXT:    [[COND224:%.*]] = phi i32 [ 99, [[COND_TRUE221]] ], [ [[TMP1]], [[COND_FALSE222]] ]
-// CHECK-NEXT:    store i32 [[COND224]], i32* [[DOTOMP_UB215]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB214]], align 4
-// CHECK-NEXT:    store i32 [[TMP2]], i32* [[DOTOMP_IV212]], align 4
-// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND225:%.*]]
-// CHECK:       omp.inner.for.cond225:
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4
-// CHECK-NEXT:    [[CMP226:%.*]] = icmp sle i32 [[TMP3]], [[TMP4]]
-// CHECK-NEXT:    br i1 [[CMP226]], label [[OMP_INNER_FOR_BODY227:%.*]], label [[OMP_INNER_FOR_END236:%.*]]
-// CHECK:       omp.inner.for.body227:
-// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4
-// CHECK-NEXT:    [[MUL228:%.*]] = mul nsw i32 [[TMP5]], 1
-// CHECK-NEXT:    [[ADD229:%.*]] = add nsw i32 0, [[MUL228]]
-// CHECK-NEXT:    store i32 [[ADD229]], i32* [[I218]], align 4
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
-// CHECK-NEXT:    [[CONV230:%.*]] = sitofp i32 [[TMP6]] to double
-// CHECK-NEXT:    [[TMP7:%.*]] = load double, double* [[B_ADDR]], align 8
-// CHECK-NEXT:    [[ADD231:%.*]] = fadd double [[CONV230]], [[TMP7]]
-// CHECK-NEXT:    [[CONV232:%.*]] = fptrunc double [[ADD231]] to float
-// CHECK-NEXT:    [[TMP8:%.*]] = load float*, float** [[R_ADDR]], align 8
-// CHECK-NEXT:    store float [[CONV232]], float* [[TMP8]], align 4
-// CHECK-NEXT:    br label [[OMP_BODY_CONTINUE233:%.*]]
-// CHECK:       omp.body.continue233:
-// CHECK-NEXT:    br label [[OMP_INNER_FOR_INC234:%.*]]
-// CHECK:       omp.inner.for.inc234:
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4
-// CHECK-NEXT:    [[ADD235:%.*]] = add nsw i32 [[TMP9]], 1
-// CHECK-NEXT:    store i32 [[ADD235]], i32* [[DOTOMP_IV212]], align 4
-// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND225]]
-// CHECK:       omp.inner.for.end236:
-// CHECK-NEXT:    br label [[OMP_LOOP_EXIT237:%.*]]
-// CHECK:       omp.loop.exit237:
-// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM238:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB43:@.*]])
-// CHECK-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM238]])
-// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM239:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-// CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB7:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM239]])
+// CHECK-NEXT:    store i32 0, i32* [[I185]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON_17]], %struct.anon.17* [[AGG_CAPTURED186]], i32 0, i32 0
+// CHECK-NEXT:    store i32* [[I185]], i32** [[TMP0]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_18]], %struct.anon.18* [[AGG_CAPTURED187]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[I185]], align 4
+// CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP1]], align 4
+// CHECK-NEXT:    call void @__captured_stmt.19(i32* [[DOTCOUNT_ADDR188]], %struct.anon.17* [[AGG_CAPTURED186]])
+// CHECK-NEXT:    [[DOTCOUNT189:%.*]] = load i32, i32* [[DOTCOUNT_ADDR188]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_PREHEADER190:%.*]]
+// CHECK:       omp_loop.preheader190:
+// CHECK-NEXT:    store i32 0, i32* [[P_LOWERBOUND204]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1
+// CHECK-NEXT:    store i32 [[TMP3]], i32* [[P_UPPERBOUND205]], align 4
+// CHECK-NEXT:    store i32 1, i32* [[P_STRIDE206]], align 4
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM207:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+// CHECK-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 1)
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND204]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND205]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER191:%.*]]
+// CHECK:       omp_loop.header191:
+// CHECK-NEXT:    [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ]
+// CHECK-NEXT:    br label [[OMP_LOOP_COND192:%.*]]
+// CHECK:       omp_loop.cond192:
+// CHECK-NEXT:    [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]]
+// CHECK:       omp_loop.body193:
+// CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]]
+// CHECK-NEXT:    call void @__captured_stmt.20(i32* [[I185]], i32 [[TMP8]], %struct.anon.18* [[AGG_CAPTURED187]])
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK-NEXT:    [[CONV200:%.*]] = sitofp i32 [[TMP9]] to double
+// CHECK-NEXT:    [[TMP10:%.*]] = load double, double* [[B_ADDR]], align 8
+// CHECK-NEXT:    [[ADD201:%.*]] = fadd double [[CONV200]], [[TMP10]]
+// CHECK-NEXT:    [[CONV202:%.*]] = fptrunc double [[ADD201]] to float
+// CHECK-NEXT:    [[TMP11:%.*]] = load float*, float** [[R_ADDR]], align 8
+// CHECK-NEXT:    store float [[CONV202]], float* [[TMP11]], align 4
+// CHECK-NEXT:    br label [[OMP_LOOP_INC194]]
+// CHECK:       omp_loop.inc194:
+// CHECK-NEXT:    [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1
+// CHECK-NEXT:    br label [[OMP_LOOP_HEADER191]]
+// CHECK:       omp_loop.exit195:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM207]])
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM208:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+// CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM208]])
+// CHECK-NEXT:    br label [[OMP_LOOP_AFTER196:%.*]]
+// CHECK:       omp_loop.after196:
 // CHECK-NEXT:    ret void
 //
 // CHECK-DEBUG-LABEL: @_Z14parallel_for_2Pfid(
@@ -176,86 +172,77 @@
 // CHECK-DEBUG-NEXT:    [[R_ADDR:%.*]] = alloca float*, align 8
 // CHECK-DEBUG-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
-// CHECK-DEBUG-NEXT:    [[DOTOMP_IV212:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT:    [[TMP213:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT:    [[DOTOMP_LB214:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT:    [[DOTOMP_UB215:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT:    [[DOTOMP_STRIDE216:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT:    [[DOTOMP_IS_LAST217:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT:    [[I218:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT:    [[I185:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT:    [[AGG_CAPTURED186:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8
+// CHECK-DEBUG-NEXT:    [[AGG_CAPTURED187:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4
+// CHECK-DEBUG-NEXT:    [[DOTCOUNT_ADDR188:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT:    [[P_LASTITER203:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT:    [[P_LOWERBOUND204:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT:    [[P_UPPERBOUND205:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT:    [[P_STRIDE206:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT:    store float* [[R:%.*]], float** [[R_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata float** [[R_ADDR]], [[META77:metadata !.*]], metadata !DIExpression()), [[DBG78:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata float** [[R_ADDR]], [[META123:metadata !.*]], metadata !DIExpression()), [[DBG124:!dbg !.*]]
 // CHECK-DEBUG-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], [[META79:metadata !.*]], metadata !DIExpression()), [[DBG80:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], [[META125:metadata !.*]], metadata !DIExpression()), [[DBG126:!dbg !.*]]
 // CHECK-DEBUG-NEXT:    store double [[B:%.*]], double* [[B_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata double* [[B_ADDR]], [[META81:metadata !.*]], metadata !DIExpression()), [[DBG82:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB21:@.*]]), [[DBG83:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata double* [[B_ADDR]], [[META127:metadata !.*]], metadata !DIExpression()), [[DBG128:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB13:@.*]]), [[DBG129:!dbg !.*]]
 // CHECK-DEBUG-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK-DEBUG:       omp_parallel:
-// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB21]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), [[DBG84:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT211:%.*]]
-// CHECK-DEBUG:       omp.par.outlined.exit211:
+// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB13]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), [[DBG130:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT184:%.*]]
+// CHECK-DEBUG:       omp.par.outlined.exit184:
 // CHECK-DEBUG-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 // CHECK-DEBUG:       omp.par.exit.split:
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IV212]], [[META87:metadata !.*]], metadata !DIExpression()), [[DBG89:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[DOTOMP_LB214]], [[META90:metadata !.*]], metadata !DIExpression()), [[DBG89]]
-// CHECK-DEBUG-NEXT:    store i32 0, i32* [[DOTOMP_LB214]], align 4, [[DBG91:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[DOTOMP_UB215]], [[META92:metadata !.*]], metadata !DIExpression()), [[DBG89]]
-// CHECK-DEBUG-NEXT:    store i32 99, i32* [[DOTOMP_UB215]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[DOTOMP_STRIDE216]], [[META93:metadata !.*]], metadata !DIExpression()), [[DBG89]]
-// CHECK-DEBUG-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE216]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IS_LAST217]], [[META94:metadata !.*]], metadata !DIExpression()), [[DBG89]]
-// CHECK-DEBUG-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST217]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[I218]], [[META95:metadata !.*]], metadata !DIExpression()), [[DBG89]]
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM219:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB79:@.*]])
-// CHECK-DEBUG-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* [[GLOB78:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM219]], i32 34, i32* [[DOTOMP_IS_LAST217]], i32* [[DOTOMP_LB214]], i32* [[DOTOMP_UB215]], i32* [[DOTOMP_STRIDE216]], i32 1, i32 1), [[DBG96:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    [[CMP220:%.*]] = icmp sgt i32 [[TMP0]], 99, [[DBG91]]
-// CHECK-DEBUG-NEXT:    br i1 [[CMP220]], label [[COND_TRUE221:%.*]], label [[COND_FALSE222:%.*]], [[DBG91]]
-// CHECK-DEBUG:       cond.true221:
-// CHECK-DEBUG-NEXT:    br label [[COND_END223:%.*]], [[DBG91]]
-// CHECK-DEBUG:       cond.false222:
-// CHECK-DEBUG-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    br label [[COND_END223]], [[DBG91]]
-// CHECK-DEBUG:       cond.end223:
-// CHECK-DEBUG-NEXT:    [[COND224:%.*]] = phi i32 [ 99, [[COND_TRUE221]] ], [ [[TMP1]], [[COND_FALSE222]] ], [[DBG91]]
-// CHECK-DEBUG-NEXT:    store i32 [[COND224]], i32* [[DOTOMP_UB215]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB214]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    store i32 [[TMP2]], i32* [[DOTOMP_IV212]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_INNER_FOR_COND225:%.*]], [[DBG97:!dbg !.*]]
-// CHECK-DEBUG:       omp.inner.for.cond225:
-// CHECK-DEBUG-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    [[CMP226:%.*]] = icmp sle i32 [[TMP3]], [[TMP4]], [[DBG98:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    br i1 [[CMP226]], label [[OMP_INNER_FOR_BODY227:%.*]], label [[OMP_INNER_FOR_END236:%.*]], [[DBG97]]
-// CHECK-DEBUG:       omp.inner.for.body227:
-// CHECK-DEBUG-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    [[MUL228:%.*]] = mul nsw i32 [[TMP5]], 1, [[DBG99:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    [[ADD229:%.*]] = add nsw i32 0, [[MUL228]], [[DBG99]]
-// CHECK-DEBUG-NEXT:    store i32 [[ADD229]], i32* [[I218]], align 4, [[DBG99]]
-// CHECK-DEBUG-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4, [[DBG100:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    [[CONV230:%.*]] = sitofp i32 [[TMP6]] to double, [[DBG100]]
-// CHECK-DEBUG-NEXT:    [[TMP7:%.*]] = load double, double* [[B_ADDR]], align 8, [[DBG101:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    [[ADD231:%.*]] = fadd double [[CONV230]], [[TMP7]], [[DBG102:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    [[CONV232:%.*]] = fptrunc double [[ADD231]] to float, [[DBG100]]
-// CHECK-DEBUG-NEXT:    [[TMP8:%.*]] = load float*, float** [[R_ADDR]], align 8, [[DBG103:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    store float [[CONV232]], float* [[TMP8]], align 4, [[DBG104:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_BODY_CONTINUE233:%.*]], [[DBG105:!dbg !.*]]
-// CHECK-DEBUG:       omp.body.continue233:
-// CHECK-DEBUG-NEXT:    br label [[OMP_INNER_FOR_INC234:%.*]], [[DBG96]]
-// CHECK-DEBUG:       omp.inner.for.inc234:
-// CHECK-DEBUG-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4, [[DBG91]]
-// CHECK-DEBUG-NEXT:    [[ADD235:%.*]] = add nsw i32 [[TMP9]], 1, [[DBG98]]
-// CHECK-DEBUG-NEXT:    store i32 [[ADD235]], i32* [[DOTOMP_IV212]], align 4, [[DBG98]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_INNER_FOR_COND225]], [[DBG96]], [[LOOP106:!llvm.loop !.*]]
-// CHECK-DEBUG:       omp.inner.for.end236:
-// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_EXIT237:%.*]], [[DBG96]]
-// CHECK-DEBUG:       omp.loop.exit237:
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM238:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB82:@.*]])
-// CHECK-DEBUG-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB81:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM238]]), [[DBG107:!dbg !.*]]
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM239:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB82]]), [[DBG107]]
-// CHECK-DEBUG-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB83:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM239]]), [[DBG107]]
-// CHECK-DEBUG-NEXT:    ret void, [[DBG108:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[I185]], [[META134:metadata !.*]], metadata !DIExpression()), [[DBG137:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    store i32 0, i32* [[I185]], align 4, [[DBG137]]
+// CHECK-DEBUG-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON_17]], %struct.anon.17* [[AGG_CAPTURED186]], i32 0, i32 0, [[DBG138:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    store i32* [[I185]], i32** [[TMP0]], align 8, [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_18]], %struct.anon.18* [[AGG_CAPTURED187]], i32 0, i32 0, [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[TMP2:%.*]] = load i32, i32* [[I185]], align 4, [[DBG139:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    store i32 [[TMP2]], i32* [[TMP1]], align 4, [[DBG138]]
+// CHECK-DEBUG-NEXT:    call void @__captured_stmt.19(i32* [[DOTCOUNT_ADDR188]], %struct.anon.17* [[AGG_CAPTURED186]]), [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[DOTCOUNT189:%.*]] = load i32, i32* [[DOTCOUNT_ADDR188]], align 4, [[DBG138]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_PREHEADER190:%.*]], [[DBG138]]
+// CHECK-DEBUG:       omp_loop.preheader190:
+// CHECK-DEBUG-NEXT:    store i32 0, i32* [[P_LOWERBOUND204]], align 4, [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1, [[DBG138]]
+// CHECK-DEBUG-NEXT:    store i32 [[TMP3]], i32* [[P_UPPERBOUND205]], align 4, [[DBG138]]
+// CHECK-DEBUG-NEXT:    store i32 1, i32* [[P_STRIDE206]], align 4, [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM207:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB42:@.*]]), [[DBG138]]
+// CHECK-DEBUG-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* [[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 1), [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND204]], align 4, [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND205]], align 4, [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]], [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 1, [[DBG138]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_HEADER191:%.*]], [[DBG138]]
+// CHECK-DEBUG:       omp_loop.header191:
+// CHECK-DEBUG-NEXT:    [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ], [[DBG138]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_COND192:%.*]], [[DBG138]]
+// CHECK-DEBUG:       omp_loop.cond192:
+// CHECK-DEBUG-NEXT:    [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]], [[DBG138]]
+// CHECK-DEBUG-NEXT:    br i1 [[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]], [[DBG138]]
+// CHECK-DEBUG:       omp_loop.body193:
+// CHECK-DEBUG-NEXT:    [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]], [[DBG138]]
+// CHECK-DEBUG-NEXT:    call void @__captured_stmt.20(i32* [[I185]], i32 [[TMP8]], %struct.anon.18* [[AGG_CAPTURED187]]), [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, [[DBG140:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    [[CONV200:%.*]] = sitofp i32 [[TMP9]] to double, [[DBG140]]
+// CHECK-DEBUG-NEXT:    [[TMP10:%.*]] = load double, double* [[B_ADDR]], align 8, [[DBG141:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    [[ADD201:%.*]] = fadd double [[CONV200]], [[TMP10]], [[DBG142:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    [[CONV202:%.*]] = fptrunc double [[ADD201]] to float, [[DBG140]]
+// CHECK-DEBUG-NEXT:    [[TMP11:%.*]] = load float*, float** [[R_ADDR]], align 8, [[DBG143:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    store float [[CONV202]], float* [[TMP11]], align 4, [[DBG144:!dbg !.*]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_INC194]], [[DBG138]]
+// CHECK-DEBUG:       omp_loop.inc194:
+// CHECK-DEBUG-NEXT:    [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1, [[DBG138]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_HEADER191]], [[DBG138]]
+// CHECK-DEBUG:       omp_loop.exit195:
+// CHECK-DEBUG-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]]), [[DBG138]]
+// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM208:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB42]]), [[DBG141]]
+// CHECK-DEBUG-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB43:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM208]]), [[DBG141]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_AFTER196:%.*]], [[DBG138]]
+// CHECK-DEBUG:       omp_loop.after196:
+// CHECK-DEBUG-NEXT:    ret void, [[DBG145:!dbg !.*]]
 //
 void parallel_for_2(float *r, int a, double b) {
 #pragma omp parallel
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2037,6 +2037,7 @@
   void VisitPseudoObjectExpr(const PseudoObjectExpr *E);
   void VisitOpaqueValueExpr(const OpaqueValueExpr *E);
   void VisitLambdaExpr(const LambdaExpr *E);
+  void VisitOMPCanonicalLoop(const OMPCanonicalLoop *L);
   void VisitOMPExecutableDirective(const OMPExecutableDirective *D);
   void VisitOMPLoopDirective(const OMPLoopDirective *D);
   void VisitOMPParallelDirective(const OMPParallelDirective *D);
@@ -2833,6 +2834,11 @@
   Visit(E->getSyntacticForm());
 }
 
+void EnqueueVisitor::VisitOMPCanonicalLoop(const OMPCanonicalLoop *L) {
+  VisitStmt(L);
+  EnqueueChildren(L);
+}
+
 void EnqueueVisitor::VisitOMPExecutableDirective(
     const OMPExecutableDirective *D) {
   EnqueueChildren(D);
@@ -5649,6 +5655,8 @@
     return cxstring::createRef("attribute(warn_unused_result)");
   case CXCursor_AlignedAttr:
     return cxstring::createRef("attribute(aligned)");
+  case CXCursor_OMPCanonicalLoop:
+    return cxstring::createRef("OMPCanonicalLoop");
   }
 
   llvm_unreachable("Unhandled CXCursorKind");
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -639,6 +639,9 @@
   case Stmt::MSDependentExistsStmtClass:
     K = CXCursor_UnexposedStmt;
     break;
+  case Stmt::OMPCanonicalLoopClass:
+    K = CXCursor_OMPCanonicalLoop;
+    break;
   case Stmt::OMPParallelDirectiveClass:
     K = CXCursor_OMPParallelDirective;
     break;
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -31,6 +31,7 @@
   /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
   /// not have an effect on \p M (see initialize).
   OpenMPIRBuilder(Module &M) : M(M), Builder(M.getContext()) {}
+  ~OpenMPIRBuilder();
 
   /// Initialize the internal state, this will put structures types and
   /// potentially other helpers into the underlying module. Must be called
@@ -38,10 +39,11 @@
   void initialize();
 
   /// Finalize the underlying module, e.g., by outlining regions.
+  /// \param Fn                    The function to be finalized.
   /// \param AllowExtractorSinking Flag to include sinking instructions,
   ///                              emitted by CodeExtractor, in the
   ///                              outlined region. Default is false.
-  void finalize(bool AllowExtractorSinking = false);
+  void finalize(Function *Fn, bool AllowExtractorSinking = false);
 
   /// Add attributes known for \p FnID to \p Fn.
   void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
@@ -364,6 +366,31 @@
                                                bool NeedsBarrier,
                                                Value *Chunk = nullptr);
 
+  /// Modifies the canonical loop to be a workshare loop.
+  ///
+  /// This takes a \p LoopInfo representing a canonical loop, such as the one
+  /// created by \p createCanonicalLoop and emits additional instructions to
+  /// turn it into a workshare loop. In particular, it calls to an OpenMP
+  /// runtime function in the preheader to obtain the loop bounds to be used in
+  /// the current thread, updates the relevant instructions in the canonical
+  /// loop and calls to an OpenMP runtime finalization function after the loop.
+  ///
+  /// \param Loc      The source location description, the insertion location
+  ///                 is not used.
+  /// \param CLI      A descriptor of the canonical loop to workshare.
+  /// \param AllocaIP An insertion point for Alloca instructions usable in the
+  ///                 preheader of the loop.
+  /// \param NeedsBarrier Indicates whether a barrier must be insterted after
+  ///                     the loop.
+  /// \param Chunk    The size of loop chunk considered as a unit when
+  ///                 scheduling. If \p nullptr, defaults to 1.
+  ///
+  /// \returns Updated CanonicalLoopInfo.
+  CanonicalLoopInfo *createWorkshareLoop(const LocationDescription &Loc,
+                                         CanonicalLoopInfo *CLI,
+                                         InsertPointTy AllocaIP,
+                                         bool NeedsBarrier);
+
   /// Tile a loop nest.
   ///
   /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
@@ -543,6 +570,9 @@
     /// vector and set.
     void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet,
                        SmallVectorImpl<BasicBlock *> &BlockVector);
+
+    /// Return the function that contains the region to be outlined.
+    Function *getFunction() const { return EntryBB->getParent(); }
   };
 
   /// Collection of regions that need to be outlined during finalization.
@@ -916,6 +946,8 @@
     return {After, After->begin()};
   };
 
+  Function *getFunction() const { return Header->getParent(); }
+
   /// Consistency self-check.
   void assertOK() const;
 };
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -126,15 +126,23 @@
 
 void OpenMPIRBuilder::initialize() { initializeTypes(M); }
 
-void OpenMPIRBuilder::finalize(bool AllowExtractorSinking) {
+void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) {
   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
   SmallVector<BasicBlock *, 32> Blocks;
+  SmallVector<OutlineInfo, 16> DeferredOutlines;
   for (OutlineInfo &OI : OutlineInfos) {
+    // Skip functions that have not finalized yet; may happen with nested
+    // function generation.
+    if (OI.getFunction() != Fn) {
+      DeferredOutlines.push_back(OI);
+      continue;
+    }
+
     ParallelRegionBlockSet.clear();
     Blocks.clear();
     OI.collectBlocks(ParallelRegionBlockSet, Blocks);
 
-    Function *OuterFn = OI.EntryBB->getParent();
+    Function *OuterFn = OI.getFunction();
     CodeExtractorAnalysisCache CEAC(*OuterFn);
     CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                             /* AggregateArgs */ false,
@@ -199,8 +207,12 @@
       OI.PostOutlineCB(*OutlinedFn);
   }
 
-  // Allow finalize to be called multiple times.
-  OutlineInfos.clear();
+  // Remove work items that have been completed.
+  OutlineInfos = std::move(DeferredOutlines);
+}
+
+OpenMPIRBuilder::~OpenMPIRBuilder() {
+  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
 }
 
 Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
@@ -1164,6 +1176,13 @@
   return CLI;
 }
 
+CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop(
+    const LocationDescription &Loc, CanonicalLoopInfo *CLI,
+    InsertPointTy AllocaIP, bool NeedsBarrier) {
+  // Currently only supports static schedules.
+  return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier);
+}
+
 /// Make \p Source branch to \p Target.
 ///
 /// Handles two situations:
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -794,7 +794,8 @@
       BranchInst::Create(AfterBB, AfterIP.getBlock());
 
       // Perform the actual outlining.
-      OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true);
+      OMPInfoCache.OMPBuilder.finalize(OriginalFn,
+                                       /* AllowExtractorSinking */ true);
 
       Function *OutlinedFn = MergableCIs.front()->getCaller();