Index: include/clang/AST/StmtOpenMP.h =================================================================== --- include/clang/AST/StmtOpenMP.h +++ include/clang/AST/StmtOpenMP.h @@ -314,6 +314,9 @@ friend class ASTStmtReader; /// \brief Number of collapsed loops as specified by 'collapse' clause. unsigned CollapsedNum; + /// \brief DefaultScheduleKind - Schedule type to use for a given target + /// if no 'schedule' clause or a 'schedule' type 'auto' is specified. + OpenMPDefaultScheduleKind DefaultScheduleKind; /// \brief Offsets to the stored exprs. /// This enumeration contains offsets to all the pointers to children @@ -412,7 +415,7 @@ : OMPExecutableDirective(That, SC, Kind, StartLoc, EndLoc, NumClauses, numLoopChildren(CollapsedNum, Kind) + NumSpecialChildren), - CollapsedNum(CollapsedNum) {} + CollapsedNum(CollapsedNum), DefaultScheduleKind(OMPDSK_unknown) {} /// \brief Offset to the start of children expression arrays. static unsigned getArraysOffset(OpenMPDirectiveKind Kind) { @@ -521,6 +524,9 @@ "expected worksharing loop directive"); *std::next(child_begin(), PrevUpperBoundVariableOffset) = PrevUB; } + void setDefaultSchedule(OpenMPDefaultScheduleKind SK) { + DefaultScheduleKind = SK; + } void setCounters(ArrayRef A); void setPrivateCounters(ArrayRef A); void setInits(ArrayRef A); @@ -567,6 +573,9 @@ /// \brief PreviousUpperBound - local variable passed to runtime in the /// enclosing schedule or null if that does not apply. Expr *PrevUB; + /// \brief DefaultScheduleKind - Schedule type to use for the given target + /// if no 'schedule' clause or a 'schedule' type 'auto' is specified. + OpenMPDefaultScheduleKind DefaultScheduleKind; /// \brief Counters Loop counters. SmallVector Counters; /// \brief PrivateCounters Loop counters. 
@@ -608,6 +617,7 @@ NumIterations = nullptr; PrevLB = nullptr; PrevUB = nullptr; + DefaultScheduleKind = OMPDSK_unknown; Counters.resize(Size); PrivateCounters.resize(Size); Inits.resize(Size); @@ -739,6 +749,9 @@ return const_cast(reinterpret_cast( *std::next(child_begin(), PrevUpperBoundVariableOffset))); } + OpenMPDefaultScheduleKind getDefaultSchedule() const { + return DefaultScheduleKind; + } const Stmt *getBody() const { // This relies on the loop form is already checked by Sema. Stmt *Body = getAssociatedStmt()->IgnoreContainers(true); Index: include/clang/Basic/OpenMPKinds.h =================================================================== --- include/clang/Basic/OpenMPKinds.h +++ include/clang/Basic/OpenMPKinds.h @@ -127,6 +127,9 @@ OpenMPScheduleClauseModifier M2 = OMPC_SCHEDULE_MODIFIER_unknown; }; +/// Default schedule type for any loop-based (#for) OpenMP directive. +enum OpenMPDefaultScheduleKind { OMPDSK_static_chunkone, OMPDSK_unknown }; + OpenMPDirectiveKind getOpenMPDirectiveKind(llvm::StringRef Str); const char *getOpenMPDirectiveName(OpenMPDirectiveKind Kind); Index: lib/AST/StmtOpenMP.cpp =================================================================== --- lib/AST/StmtOpenMP.cpp +++ lib/AST/StmtOpenMP.cpp @@ -105,6 +105,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -156,6 +157,7 @@ Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); Dir->setHasCancel(HasCancel); + Dir->setDefaultSchedule(Exprs.DefaultScheduleKind); return Dir; } @@ -207,6 +209,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -373,6 +376,7 @@ Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); Dir->setHasCancel(HasCancel); + // TODO: Set default schedule. 
return Dir; } @@ -423,6 +427,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -760,6 +765,7 @@ Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); Dir->setHasCancel(HasCancel); + // TODO: Set default schedule. return Dir; } @@ -1003,6 +1009,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1077,6 +1084,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1133,6 +1141,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1188,6 +1197,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1242,6 +1252,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1336,6 +1347,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1389,6 +1401,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1444,6 +1457,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1502,6 +1516,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1582,6 +1597,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. 
return Dir; } @@ -1640,6 +1656,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1701,6 +1718,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1759,6 +1777,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } Index: lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- lib/CodeGen/CGStmtOpenMP.cpp +++ lib/CodeGen/CGStmtOpenMP.cpp @@ -2187,13 +2187,35 @@ } const unsigned IVSize = getContext().getTypeSize(IVExpr->getType()); const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation(); + // For NVPTX and other GPU targets high performance is often achieved + // if adjacent threads access memory in a coalesced manner. This is + // true for loops that access memory with stride one if a static + // schedule with chunk size of 1 is used. We generate such code + // whenever the OpenMP standard gives us freedom to do so. + // + // This case is called if there is no schedule clause, with a + // schedule(auto), or with a schedule(static,1). + // + // Codegen is optimized for this case. Since chunk size is 1 we do not + // need to generate the inner loop, i.e., the chunk iterator can be + // removed. + // while(idx < GlobalUB) { + // BODY; + // idx += ST; + // } + if (S.getDefaultSchedule() == OMPDSK_static_chunkone) { + ScheduleKind.Schedule = OMPC_SCHEDULE_static; + if (!Chunk) // Force use of chunk=1 + Chunk = Builder.getIntN(IVSize, 1); + } // OpenMP 4.5, 2.7.1 Loop Construct, Description. // If the static schedule kind is specified or if the ordered clause is // specified, and if no monotonic modifier is specified, the effect will // be as if the monotonic modifier was specified. 
- if (RT.isStaticNonchunked(ScheduleKind.Schedule, - /* Chunked */ Chunk != nullptr) && - !Ordered) { + if (S.getDefaultSchedule() == OMPDSK_static_chunkone || + (RT.isStaticNonchunked(ScheduleKind.Schedule, + /* Chunked */ Chunk != nullptr) && + !Ordered)) { if (isOpenMPSimdDirective(S.getDirectiveKind())) EmitOMPSimdInit(S, /*IsMonotonic=*/true); // OpenMP [2.7.1, Loop Construct, Description, table 2-1] @@ -2201,17 +2223,21 @@ // chunks that are approximately equal in size, and at most one chunk is // distributed to each thread. Note that the size of the chunks is // unspecified in this case. - RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind, - IVSize, IVSigned, Ordered, - IL.getAddress(), LB.getAddress(), - UB.getAddress(), ST.getAddress()); + RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind, IVSize, + IVSigned, Ordered, IL.getAddress(), + LB.getAddress(), UB.getAddress(), ST.getAddress(), + Chunk); auto LoopExit = getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit")); - // UB = min(UB, GlobalUB); - EmitIgnoredExpr(S.getEnsureUpperBound()); + if (S.getDefaultSchedule() != OMPDSK_static_chunkone) { + // UB = min(UB, GlobalUB); + EmitIgnoredExpr(S.getEnsureUpperBound()); + } // IV = LB; EmitIgnoredExpr(S.getInit()); // while (idx <= UB) { BODY; ++idx; } + // if OMPDSK_static_chunkone: + // while (idx <= GlobalUB) { BODY; idx += ST; } EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(), S.getInc(), [&S, LoopExit](CodeGenFunction &CGF) { Index: lib/Sema/SemaOpenMP.cpp =================================================================== --- lib/Sema/SemaOpenMP.cpp +++ lib/Sema/SemaOpenMP.cpp @@ -3855,15 +3855,71 @@ return PostUpdate; } +/// Get the default schedule type for any loop-based OpenMP directive, +/// specialized for a particular target. This is used to guide codegen +/// if a) no 'schedule' clause is specified, or b) a 'schedule' type of +/// 'auto' is specified by the user. 
+static OpenMPDefaultScheduleKind +getDefaultSchedule(Sema &S, OpenMPDirectiveKind Kind, + ArrayRef Clauses) { + OpenMPDefaultScheduleKind DefaultSchedule = OMPDSK_unknown; + + if (S.getLangOpts().OpenMPIsDevice && + S.Context.getTargetInfo().getTriple().isNVPTX()) { + // Force a schedule type of (static,1) if there is no schedule clause, or + // the user specifies schedule(auto) or schedule(static,1). + bool ChunkSizeOne = false; + auto ScheduleKind = OMPC_SCHEDULE_unknown; + auto ScheduleClause = + OMPExecutableDirective::getClausesOfKind(Clauses); + if (ScheduleClause.begin() != ScheduleClause.end()) { + ScheduleKind = (*ScheduleClause.begin())->getScheduleKind(); + if (const auto *Ch = (*ScheduleClause.begin())->getChunkSize()) { + if (!Ch->isValueDependent() && !Ch->isTypeDependent() && + !Ch->isInstantiationDependent() && + !Ch->containsUnexpandedParameterPack()) { + SourceLocation ChLoc = Ch->getLocStart(); + ExprResult Val = S.PerformOpenMPImplicitIntegerConversion( + ChLoc, const_cast(Ch)); + if (!Val.isInvalid()) { + Expr *ValExpr = Val.get(); + llvm::APSInt Result; + ChunkSizeOne = ValExpr->isIntegerConstantExpr(Result, S.Context) && + Result == 1; + } + } + } + } + + // Ordered clause requires dynamic dispatch. + auto OrderedClause = + OMPExecutableDirective::getClausesOfKind(Clauses); + bool Ordered = OrderedClause.begin() != OrderedClause.end(); + + bool StaticOneSchedule = + (!Ordered && (ScheduleKind == OMPC_SCHEDULE_unknown || + ScheduleKind == OMPC_SCHEDULE_auto || + (ScheduleKind == OMPC_SCHEDULE_static && ChunkSizeOne))); + + if (StaticOneSchedule) + DefaultSchedule = OMPDSK_static_chunkone; + } + + return DefaultSchedule; +} + /// \brief Called on a for stmt to check itself and nested loops (if any). /// \return Returns 0 if one of the collapsed stmts is not canonical for loop, /// number of collapsed loops otherwise. 
static unsigned -CheckOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, - Expr *OrderedLoopCountExpr, Stmt *AStmt, Sema &SemaRef, - DSAStackTy &DSA, +CheckOpenMPLoop(OpenMPDirectiveKind DKind, ArrayRef Clauses, + Expr *CollapseLoopCountExpr, Expr *OrderedLoopCountExpr, + Stmt *AStmt, Sema &SemaRef, DSAStackTy &DSA, llvm::DenseMap &VarsWithImplicitDSA, OMPLoopDirective::HelperExprs &Built) { + OpenMPDefaultScheduleKind DefaultScheduleKind = + getDefaultSchedule(SemaRef, DKind, Clauses); + unsigned NestedLoopCount = 1; if (CollapseLoopCountExpr) { // Found 'collapse' clause - calculate collapse number. @@ -4136,17 +4192,20 @@ // Loop condition (IV < NumIterations) or (IV <= UB) for worksharing loops. SourceLocation CondLoc; ExprResult Cond = - (isOpenMPWorksharingDirective(DKind) || - isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind)) + (DefaultScheduleKind != OMPDSK_static_chunkone && + (isOpenMPWorksharingDirective(DKind) || + isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind))) ? SemaRef.BuildBinOp(CurScope, CondLoc, BO_LE, IV.get(), UB.get()) : SemaRef.BuildBinOp(CurScope, CondLoc, BO_LT, IV.get(), NumIterations.get()); - // Loop increment (IV = IV + 1) + // Loop increment (IV = IV + 1) or (IV = IV + ST) if (static,1) scheduling. SourceLocation IncLoc; ExprResult Inc = - SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(), - SemaRef.ActOnIntegerConstant(IncLoc, 1).get()); + DefaultScheduleKind == OMPDSK_static_chunkone + ? 
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(), ST.get()) + : SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(), + SemaRef.ActOnIntegerConstant(IncLoc, 1).get()); if (!Inc.isUsable()) return 0; Inc = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, IV.get(), Inc.get()); @@ -4295,6 +4354,7 @@ Built.NUB = NextUB.get(); Built.PrevLB = PrevLB.get(); Built.PrevUB = PrevUB.get(); + Built.DefaultScheduleKind = DefaultScheduleKind; Expr *CounterVal = SemaRef.DefaultLvalueConversion(IV.get()).get(); // Fill data for doacross depend clauses. @@ -4417,9 +4477,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_simd, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), - AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); + unsigned NestedLoopCount = + CheckOpenMPLoop(OMPD_simd, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -4456,9 +4517,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_for, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), - AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); + unsigned NestedLoopCount = + CheckOpenMPLoop(OMPD_for, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -4493,7 +4555,7 @@ // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. 
unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_for_simd, getCollapseNumberExpr(Clauses), + CheckOpenMPLoop(OMPD_for_simd, Clauses, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -4694,10 +4756,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_parallel_for, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_parallel_for, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -4739,10 +4801,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_parallel_for_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_parallel_for_simd, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -5697,10 +5759,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. 
- unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_target_parallel_for, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_target_parallel_for, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -5905,7 +5967,7 @@ // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_taskloop, getCollapseNumberExpr(Clauses), + CheckOpenMPLoop(OMPD_taskloop, Clauses, getCollapseNumberExpr(Clauses), /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -5936,10 +5998,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_taskloop_simd, Clauses, getCollapseNumberExpr(Clauses), + /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -5980,7 +6042,7 @@ // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_distribute, getCollapseNumberExpr(Clauses), + CheckOpenMPLoop(OMPD_distribute, Clauses, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6013,7 +6075,7 @@ // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_distribute_parallel_for, getCollapseNumberExpr(Clauses), + OMPD_distribute_parallel_for, Clauses, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6045,10 +6107,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = + CheckOpenMPLoop(OMPD_distribute_parallel_for_simd, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6081,10 +6144,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, - *this, *DSAStack, VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_distribute_simd, Clauses, getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6118,7 +6181,7 @@ // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. 
unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_target_parallel_for_simd, getCollapseNumberExpr(Clauses), + OMPD_target_parallel_for_simd, Clauses, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6164,7 +6227,7 @@ // In presence of clause 'collapse' with number of loops, it will define the // nested loops number. unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_target_simd, getCollapseNumberExpr(Clauses), + CheckOpenMPLoop(OMPD_target_simd, Clauses, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6210,10 +6273,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_teams_distribute, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, - *this, *DSAStack, VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_teams_distribute, Clauses, getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6244,7 +6307,7 @@ // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_teams_distribute_simd, getCollapseNumberExpr(Clauses), + OMPD_teams_distribute_simd, Clauses, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); @@ -6291,10 +6354,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
- auto NestedLoopCount = CheckOpenMPLoop( - OMPD_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + auto NestedLoopCount = + CheckOpenMPLoop(OMPD_teams_distribute_parallel_for_simd, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6339,10 +6403,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = + CheckOpenMPLoop(OMPD_teams_distribute_parallel_for, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6406,8 +6471,7 @@ // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. auto NestedLoopCount = CheckOpenMPLoop( - OMPD_target_teams_distribute, - getCollapseNumberExpr(Clauses), + OMPD_target_teams_distribute, Clauses, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6439,11 +6503,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
- auto NestedLoopCount = CheckOpenMPLoop( - OMPD_target_teams_distribute_parallel_for, - getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + auto NestedLoopCount = + CheckOpenMPLoop(OMPD_target_teams_distribute_parallel_for, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6484,11 +6548,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - auto NestedLoopCount = CheckOpenMPLoop( - OMPD_target_teams_distribute_parallel_for_simd, - getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + auto NestedLoopCount = + CheckOpenMPLoop(OMPD_target_teams_distribute_parallel_for_simd, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6530,10 +6594,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
- auto NestedLoopCount = CheckOpenMPLoop( - OMPD_target_teams_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + auto NestedLoopCount = + CheckOpenMPLoop(OMPD_target_teams_distribute_simd, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); Index: test/OpenMP/nvptx_coalesced_scheduling_codegen.cpp =================================================================== --- /dev/null +++ test/OpenMP/nvptx_coalesced_scheduling_codegen.cpp @@ -0,0 +1,322 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + 
+// Check that the execution mode of the target regions on the gpu is set to the right mode. +// CHECK-DAG: {{@__omp_offloading_.+l19}}_exec_mode = weak constant i8 0 + +template +tx ftemplate() { + tx a[100]; + tx b[10][10]; + + #pragma omp target parallel + { + #pragma omp for + for (int i = 0; i < 99; i++) { + a[i] = 1; + } + + #pragma omp for schedule(auto) + for (int i = 0; i < 98; i++) { + a[i] = 2; + } + + #pragma omp for schedule(static,1) + for (int i = 0; i < 97; i++) { + a[i] = 3; + } + + #pragma omp for schedule(static,2) + for (int i = 0; i < 96; i++) { + a[i] = 1; + } + + #pragma omp for schedule(static) + for (int i = 0; i < 95; i++) { + a[i] = 1; + } + + #pragma omp for schedule(auto) ordered + for (int i = 0; i < 94; i++) { + a[i] = 1; + } + + #pragma omp for schedule(runtime) + for (int i = 0; i < 93; i++) { + a[i] = 1; + } + + #pragma omp for schedule(dynamic) + for (int i = 0; i < 92; i++) { + a[i] = 1; + } + + #pragma omp for schedule(guided) + for (int i = 0; i < 91; i++) { + a[i] = 1; + } + } + + return a[0] + b[9][9]; +} + +int bar(){ + int a = 0; + + a += ftemplate(); + + return a; +} + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l19}}( + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXEC:.+]] + // + // CHECK: [[EXEC]] + // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* + // CHECK: br label {{%?}}[[DONE:.+]] + // + // CHECK: [[DONE]] + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + // CHECK: } + + // CHECK: define internal void [[OP1]]( + + // No schedule clause. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 98, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1) + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // CHECK: br label {{%?}}[[FOR_COND:.+]] + // + // CHECK: [[FOR_COND]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[CMP:%.+]] = icmp slt i32 [[IV]], 99 + // CHECK: br i1 [[CMP]], label {{%?}}[[FOR_BODY:.+]], label {{%?}}[[FOR_END:.+]] + // + // [[FOR_BODY]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1 + // CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]] + // CHECK: store i32 [[ADD]], i32* [[I_PTR:%.+]], align + // CHECK: [[I:%.+]] = load i32, i32* [[I_PTR]], align + // CHECK-32: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i32 0, i32 [[I]] + // CHECK-64: [[IDX:%.+]] = sext i32 [[I]] to i64 + // CHECK-64: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i64 0, i64 [[IDX]] + // CHECK: store i32 1, i32* [[ELEM_PTR]], align + // CHECK: br label {{%?}}[[FOR_CONT:.+]] + // + // CHECK: [[FOR_CONT]] + // CHECK: br label {{%?}}[[FOR_INC:.+]] + // + // CHECK: [[FOR_INC]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ST:%.+]] = load i32, i32* [[ST_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], [[ST]] + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + // CHECK: br label {{%?}}[[FOR_COND]] + // + // CHECK: [[FOR_END]] + + + + // schedule(auto) clause. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 97, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1) + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // CHECK: br label {{%?}}[[FOR_COND:.+]] + // + // CHECK: [[FOR_COND]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[CMP:%.+]] = icmp slt i32 [[IV]], 98 + // CHECK: br i1 [[CMP]], label {{%?}}[[FOR_BODY:.+]], label {{%?}}[[FOR_END:.+]] + // + // [[FOR_BODY]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1 + // CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]] + // CHECK: store i32 [[ADD]], i32* [[I_PTR:%.+]], align + // CHECK: [[I:%.+]] = load i32, i32* [[I_PTR]], align + // CHECK-32: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i32 0, i32 [[I]] + // CHECK-64: [[IDX:%.+]] = sext i32 [[I]] to i64 + // CHECK-64: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i64 0, i64 [[IDX]] + // CHECK: store i32 2, i32* [[ELEM_PTR]], align + // CHECK: br label {{%?}}[[FOR_CONT:.+]] + // + // CHECK: [[FOR_CONT]] + // CHECK: br label {{%?}}[[FOR_INC:.+]] + // + // CHECK: [[FOR_INC]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ST:%.+]] = load i32, i32* [[ST_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], [[ST]] + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + // CHECK: br label {{%?}}[[FOR_COND]] + // + // CHECK: [[FOR_END]] + + + + // schedule(static,1) clause. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 96, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1) + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // CHECK: br label {{%?}}[[FOR_COND:.+]] + // + // CHECK: [[FOR_COND]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[CMP:%.+]] = icmp slt i32 [[IV]], 97 + // CHECK: br i1 [[CMP]], label {{%?}}[[FOR_BODY:.+]], label {{%?}}[[FOR_END:.+]] + // + // CHECK: [[FOR_BODY]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1 + // CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]] + // CHECK: store i32 [[ADD]], i32* [[I_PTR:%.+]], align + // CHECK: [[I:%.+]] = load i32, i32* [[I_PTR]], align + // CHECK-32: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i32 0, i32 [[I]] + // CHECK-64: [[IDX:%.+]] = sext i32 [[I]] to i64 + // CHECK-64: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i64 0, i64 [[IDX]] + // CHECK: store i32 3, i32* [[ELEM_PTR]], align + // CHECK: br label {{%?}}[[FOR_CONT:.+]] + // + // CHECK: [[FOR_CONT]] + // CHECK: br label {{%?}}[[FOR_INC:.+]] + // + // CHECK: [[FOR_INC]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ST:%.+]] = load i32, i32* [[ST_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], [[ST]] + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + // CHECK: br label {{%?}}[[FOR_COND]] + // + // CHECK: [[FOR_END]] + + + + // schedule(static,2) clause. Non-coalesced codegen. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 95, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 2) + // CHECK: br label {{%?}}[[DISPATCH_COND:.+]] + // + // CHECK: [[DISPATCH_COND]] + // CHECK: [[UB:%.+]] = load i32, i32* [[UB_PTR]], align + // CHECK: = icmp sgt i32 [[UB]], 95 + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // schedule(static) clause. Non-coalesced codegen. + // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 94, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 34, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1) + // CHECK: [[UB:%.+]] = load i32, i32* [[UB_PTR]], align + // CHECK: = icmp sgt i32 [[UB]], 94 + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // schedule(auto) ordered clause. Non-coalesced codegen. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 93, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 70 + // CHECK: call i32 @__kmpc_dispatch_next_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]]) + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // schedule(runtime) clause. Non-coalesced codegen. + // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 92, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 37 + // CHECK: call i32 @__kmpc_dispatch_next_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]]) + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // schedule(dynamic) clause. Non-coalesced codegen. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 91, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 35 + // CHECK: call i32 @__kmpc_dispatch_next_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]]) + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // CHECK: ret void + // CHECK: } + +#endif