diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1677,6 +1677,41 @@
   }
   return OS.str().str();
 }
+
+void CodeGenFunction::OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(
+    CodeGenFunction &CGF, const Stmt *RegionBodyStmt, InsertPointTy AllocaIP,
+    InsertPointTy CodeGenIP, Twine RegionName) {
+  CGBuilderTy &Builder = CGF.Builder;
+  Builder.restoreIP(CodeGenIP); // Emit the region starting at CodeGenIP.
+  llvm::BasicBlock *FiniBB = splitBBWithSuffix(Builder, /*CreateBranch=*/false,
+                                               "." + RegionName + ".after");
+  // FiniBB is handed to the scoped RAII helper together with AllocaIP.
+  {
+    OMPBuilderCBHelpers::InlinedRegionBodyRAII IRB(CGF, AllocaIP, *FiniBB);
+    CGF.EmitStmt(RegionBodyStmt);
+  }
+  // Branch to FiniBB only if the builder still has an insertion point.
+  if (Builder.saveIP().isSet())
+    Builder.CreateBr(FiniBB);
+}
+
+void CodeGenFunction::OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody(
+    CodeGenFunction &CGF, const Stmt *RegionBodyStmt, InsertPointTy AllocaIP,
+    InsertPointTy CodeGenIP, Twine RegionName) {
+  CGBuilderTy &Builder = CGF.Builder;
+  Builder.restoreIP(CodeGenIP); // Emit the region starting at CodeGenIP.
+  llvm::BasicBlock *FiniBB = splitBBWithSuffix(Builder, /*CreateBranch=*/false,
+                                               "." + RegionName + ".after");
+  // The RAII helper redirects allocas and the return block for the body only.
+  {
+    OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB(CGF, AllocaIP, *FiniBB);
+    CGF.EmitStmt(RegionBodyStmt);
+  }
+  // Branch to FiniBB only if the builder still has an insertion point.
+  if (Builder.saveIP().isSet())
+    Builder.CreateBr(FiniBB);
+}
+
 void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) {
   if (CGM.getLangOpts().OpenMPIRBuilder) {
     llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder();
@@ -1719,13 +1754,10 @@
     const CapturedStmt *CS = S.getCapturedStmt(OMPD_parallel);
     const Stmt *ParallelRegionBodyStmt = CS->getCapturedStmt();
 
-    auto BodyGenCB = [ParallelRegionBodyStmt,
-                      this](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                            llvm::BasicBlock &ContinuationBB) {
-      OMPBuilderCBHelpers::OutlinedRegionBodyRAII ORB(*this, AllocaIP,
-                                                      ContinuationBB);
-      OMPBuilderCBHelpers::EmitOMPRegionBody(*this, ParallelRegionBodyStmt,
-                                             CodeGenIP, ContinuationBB);
+    auto BodyGenCB = [&, this](InsertPointTy AllocaIP,
+                               InsertPointTy CodeGenIP) {
+      OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody(
+          *this, ParallelRegionBodyStmt, AllocaIP, CodeGenIP, "parallel");
     };
 
     CGCapturedStmtInfo CGSI(*CS, CR_OpenMP);
@@ -3983,22 +4015,17 @@
     if (CS) {
       for (const Stmt *SubStmt : CS->children()) {
         auto SectionCB = [this, SubStmt](InsertPointTy AllocaIP,
-                                         InsertPointTy CodeGenIP,
-                                         llvm::BasicBlock &FiniBB) {
-          OMPBuilderCBHelpers::InlinedRegionBodyRAII IRB(*this, AllocaIP,
-                                                         FiniBB);
-          OMPBuilderCBHelpers::EmitOMPRegionBody(*this, SubStmt, CodeGenIP,
-                                                 FiniBB);
+                                         InsertPointTy CodeGenIP) {
+          OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(
+              *this, SubStmt, AllocaIP, CodeGenIP, "section");
         };
         SectionCBVector.push_back(SectionCB);
       }
     } else {
       auto SectionCB = [this, CapturedStmt](InsertPointTy AllocaIP,
-                                            InsertPointTy CodeGenIP,
-                                            llvm::BasicBlock &FiniBB) {
-        OMPBuilderCBHelpers::InlinedRegionBodyRAII IRB(*this, AllocaIP, FiniBB);
-        OMPBuilderCBHelpers::EmitOMPRegionBody(*this, CapturedStmt, CodeGenIP,
-                                               FiniBB);
+                                            InsertPointTy CodeGenIP) {
+        OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(
+            *this, CapturedStmt, AllocaIP, CodeGenIP, "section");
       };
       SectionCBVector.push_back(SectionCB);
     }
@@ -4051,11 +4078,9 @@
     };
 
     auto BodyGenCB = [SectionRegionBodyStmt, this](InsertPointTy AllocaIP,
-                                                   InsertPointTy CodeGenIP,
-                                                   llvm::BasicBlock &FiniBB) {
-      OMPBuilderCBHelpers::InlinedRegionBodyRAII IRB(*this, AllocaIP, FiniBB);
-      OMPBuilderCBHelpers::EmitOMPRegionBody(*this, SectionRegionBodyStmt,
-                                             CodeGenIP, FiniBB);
+                                                   InsertPointTy CodeGenIP) {
+      OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(
+          *this, SectionRegionBodyStmt, AllocaIP, CodeGenIP, "section");
     };
 
     LexicalScope Scope(*this, S.getSourceRange());
@@ -4134,11 +4159,9 @@
     };
 
     auto BodyGenCB = [MasterRegionBodyStmt, this](InsertPointTy AllocaIP,
-                                                  InsertPointTy CodeGenIP,
-                                                  llvm::BasicBlock &FiniBB) {
-      OMPBuilderCBHelpers::InlinedRegionBodyRAII IRB(*this, AllocaIP, FiniBB);
-      OMPBuilderCBHelpers::EmitOMPRegionBody(*this, MasterRegionBodyStmt,
-                                             CodeGenIP, FiniBB);
+                                                  InsertPointTy CodeGenIP) {
+      OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(
+          *this, MasterRegionBodyStmt, AllocaIP, CodeGenIP, "master");
     };
 
     LexicalScope Scope(*this, S.getSourceRange());
@@ -4182,11 +4205,9 @@
     };
 
     auto BodyGenCB = [MaskedRegionBodyStmt, this](InsertPointTy AllocaIP,
-                                                  InsertPointTy CodeGenIP,
-                                                  llvm::BasicBlock &FiniBB) {
-      OMPBuilderCBHelpers::InlinedRegionBodyRAII IRB(*this, AllocaIP, FiniBB);
-      OMPBuilderCBHelpers::EmitOMPRegionBody(*this, MaskedRegionBodyStmt,
-                                             CodeGenIP, FiniBB);
+                                                  InsertPointTy CodeGenIP) {
+      OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(
+          *this, MaskedRegionBodyStmt, AllocaIP, CodeGenIP, "masked");
     };
 
     LexicalScope Scope(*this, S.getSourceRange());
@@ -4224,11 +4245,9 @@
     };
 
     auto BodyGenCB = [CriticalRegionBodyStmt, this](InsertPointTy AllocaIP,
-                                                    InsertPointTy CodeGenIP,
-                                                    llvm::BasicBlock &FiniBB) {
-      OMPBuilderCBHelpers::InlinedRegionBodyRAII IRB(*this, AllocaIP, FiniBB);
-      OMPBuilderCBHelpers::EmitOMPRegionBody(*this, CriticalRegionBodyStmt,
-                                             CodeGenIP, FiniBB);
+                                                    InsertPointTy CodeGenIP) {
+      OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(
+          *this, CriticalRegionBodyStmt, AllocaIP, CodeGenIP, "critical");
     };
 
     LexicalScope Scope(*this, S.getSourceRange());
@@ -5564,10 +5583,13 @@
       };
 
       auto BodyGenCB = [&S, C, this](InsertPointTy AllocaIP,
-                                     InsertPointTy CodeGenIP,
-                                     llvm::BasicBlock &FiniBB) {
+                                     InsertPointTy CodeGenIP) {
+        Builder.restoreIP(CodeGenIP);
+
         const CapturedStmt *CS = S.getInnermostCapturedStmt();
         if (C) {
+          llvm::BasicBlock *FiniBB = splitBBWithSuffix(
+              Builder, /*CreateBranch=*/false, ".ordered.after");
           llvm::SmallVector<llvm::Value *, 16> CapturedVars;
           GenerateOpenMPCapturedVars(*CS, CapturedVars);
           llvm::Function *OutlinedFn =
@@ -5575,13 +5597,11 @@
           assert(S.getBeginLoc().isValid() &&
                  "Outlined function call location must be valid.");
           ApplyDebugLocation::CreateDefaultArtificial(*this, S.getBeginLoc());
-          OMPBuilderCBHelpers::EmitCaptureStmt(*this, CodeGenIP, FiniBB,
+          OMPBuilderCBHelpers::EmitCaptureStmt(*this, CodeGenIP, *FiniBB,
                                                OutlinedFn, CapturedVars);
         } else {
-          OMPBuilderCBHelpers::InlinedRegionBodyRAII IRB(*this, AllocaIP,
-                                                         FiniBB);
-          OMPBuilderCBHelpers::EmitOMPRegionBody(*this, CS->getCapturedStmt(),
-                                                 CodeGenIP, FiniBB);
+          OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(
+              *this, CS->getCapturedStmt(), AllocaIP, CodeGenIP, "ordered");
         }
       };
 
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -1791,26 +1791,17 @@
     }
 
     /// Emit the body of an OMP region
-    /// \param CGF	The Codegen function this belongs to
-    /// \param RegionBodyStmt	The body statement for the OpenMP region being
-    /// 			 generated
-    /// \param CodeGenIP	Insertion point for generating the body code.
-    /// \param FiniBB	The finalization basic block
-    static void EmitOMPRegionBody(CodeGenFunction &CGF,
-                                  const Stmt *RegionBodyStmt,
-                                  InsertPointTy CodeGenIP,
-                                  llvm::BasicBlock &FiniBB) {
-      llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
-      if (llvm::Instruction *CodeGenIPBBTI = CodeGenIPBB->getTerminator())
-        CodeGenIPBBTI->eraseFromParent();
-
-      CGF.Builder.SetInsertPoint(CodeGenIPBB);
-
-      CGF.EmitStmt(RegionBodyStmt);
-
-      if (CGF.Builder.saveIP().isSet())
-        CGF.Builder.CreateBr(&FiniBB);
-    }
+    /// \param CGF            The Codegen function this belongs to
+    /// \param RegionBodyStmt The body statement for the OpenMP region being
+    ///                       generated
+    /// \param AllocaIP       Where to insert alloca instructions
+    /// \param CodeGenIP      Where to insert the region code
+    /// \param RegionName     Name to be used for new blocks
+    static void EmitOMPInlinedRegionBody(CodeGenFunction &CGF,
+                                         const Stmt *RegionBodyStmt,
+                                         InsertPointTy AllocaIP,
+                                         InsertPointTy CodeGenIP,
+                                         Twine RegionName);
 
     static void EmitCaptureStmt(CodeGenFunction &CGF, InsertPointTy CodeGenIP,
                                 llvm::BasicBlock &FiniBB, llvm::Function *Fn,
@@ -1830,12 +1821,25 @@
         CGF.Builder.CreateBr(&FiniBB);
     }
 
+    /// Emit the body of an OMP region that will be outlined in
+    /// OpenMPIRBuilder::finalize().
+    /// \param CGF            The Codegen function this belongs to
+    /// \param RegionBodyStmt The body statement for the OpenMP region being
+    ///                       generated
+    /// \param AllocaIP       Where to insert alloca instructions
+    /// \param CodeGenIP      Where to insert the region code
+    /// \param RegionName     Name to be used for new blocks
+    static void EmitOMPOutlinedRegionBody(CodeGenFunction &CGF,
+                                          const Stmt *RegionBodyStmt,
+                                          InsertPointTy AllocaIP,
+                                          InsertPointTy CodeGenIP,
+                                          Twine RegionName);
+
     /// RAII for preserving necessary info during Outlined region body codegen.
     class OutlinedRegionBodyRAII {
 
       llvm::AssertingVH<llvm::Instruction> OldAllocaIP;
       CodeGenFunction::JumpDest OldReturnBlock;
-      CGBuilderTy::InsertPoint IP;
       CodeGenFunction &CGF;
 
     public:
@@ -1846,7 +1850,6 @@
                "Must specify Insertion point for allocas of outlined function");
         OldAllocaIP = CGF.AllocaInsertPt;
         CGF.AllocaInsertPt = &*AllocaIP.getPoint();
-        IP = CGF.Builder.saveIP();
 
         OldReturnBlock = CGF.ReturnBlock;
         CGF.ReturnBlock = CGF.getJumpDestInCurrentScope(&RetBB);
@@ -1855,7 +1858,6 @@
       ~OutlinedRegionBodyRAII() {
         CGF.AllocaInsertPt = OldAllocaIP;
         CGF.ReturnBlock = OldReturnBlock;
-        CGF.Builder.restoreIP(IP);
       }
     };
 
diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp
--- a/clang/test/OpenMP/cancel_codegen.cpp
+++ b/clang/test/OpenMP/cancel_codegen.cpp
@@ -1325,6 +1325,14 @@
 // CHECK3-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[ARGV_ADDR:%.*]] = alloca i8**, align 8
+// CHECK3-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[P_LASTITER28:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[P_LOWERBOUND29:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[P_UPPERBOUND30:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[P_STRIDE31:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
@@ -1341,14 +1349,6 @@
 // CHECK3-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
 // CHECK3-NEXT:    store i8** [[ARGV]], i8*** [[ARGV_ADDR]], align 8
 // CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK3-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[P_LASTITER27:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[P_LOWERBOUND28:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[P_UPPERBOUND29:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[P_STRIDE30:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK3:       omp_parallel:
 // CHECK3-NEXT:    [[GEP_ARGC_ADDR:%.*]] = getelementptr { i32*, i8*** }, { i32*, i8*** }* [[STRUCTARG]], i32 0, i32 0
@@ -1382,9 +1382,20 @@
 // CHECK3-NEXT:    [[TMP4:%.*]] = add i32 [[OMP_SECTION_LOOP_IV]], [[TMP0]]
 // CHECK3-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 1
 // CHECK3-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 0
-// CHECK3-NEXT:    switch i32 [[TMP6]], label [[OMP_SECTION_LOOP_INC]] [
+// CHECK3-NEXT:    switch i32 [[TMP6]], label [[OMP_SECTION_LOOP_BODY_SECTIONS_AFTER:%.*]] [
 // CHECK3-NEXT:    i32 0, label [[OMP_SECTION_LOOP_BODY_CASE:%.*]]
 // CHECK3-NEXT:    ]
+// CHECK3:       omp_section_loop.body.case:
+// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32 3)
+// CHECK3-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
+// CHECK3-NEXT:    br i1 [[TMP8]], label [[OMP_SECTION_LOOP_BODY_CASE_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE_CNCL:%.*]]
+// CHECK3:       omp_section_loop.body.case.split:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE_SECTION_AFTER:%.*]]
+// CHECK3:       omp_section_loop.body.case.section.after:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_BODY_SECTIONS_AFTER]]
+// CHECK3:       omp_section_loop.body.sections.after:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_INC]]
 // CHECK3:       omp_section_loop.inc:
 // CHECK3-NEXT:    [[OMP_SECTION_LOOP_NEXT]] = add nuw i32 [[OMP_SECTION_LOOP_IV]], 1
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_HEADER]]
@@ -1394,125 +1405,126 @@
 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]])
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_AFTER:%.*]]
 // CHECK3:       omp_section_loop.after:
-// CHECK3-NEXT:    br label [[OMP_SECTIONS_END:%.*]]
-// CHECK3:       omp_sections.end:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_AFTERSECTIONS_FINI:%.*]]
+// CHECK3:       omp_section_loop.aftersections.fini:
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_PREHEADER13:%.*]]
 // CHECK3:       omp_section_loop.preheader13:
-// CHECK3-NEXT:    store i32 0, i32* [[P_LOWERBOUND28]], align 4
-// CHECK3-NEXT:    store i32 1, i32* [[P_UPPERBOUND29]], align 4
-// CHECK3-NEXT:    store i32 1, i32* [[P_STRIDE30]], align 4
-// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
-// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
-// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
-// CHECK3-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
-// CHECK3-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 1
+// CHECK3-NEXT:    store i32 0, i32* [[P_LOWERBOUND29]], align 4
+// CHECK3-NEXT:    store i32 1, i32* [[P_UPPERBOUND30]], align 4
+// CHECK3-NEXT:    store i32 1, i32* [[P_STRIDE31]], align 4
+// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM32:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]], i32 34, i32* [[P_LASTITER28]], i32* [[P_LOWERBOUND29]], i32* [[P_UPPERBOUND30]], i32* [[P_STRIDE31]], i32 1, i32 0)
+// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[P_LOWERBOUND29]], align 4
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[P_UPPERBOUND30]], align 4
+// CHECK3-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], [[TMP9]]
+// CHECK3-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 1
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_HEADER14:%.*]]
 // CHECK3:       omp_section_loop.header14:
 // CHECK3-NEXT:    [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER13]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ]
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_COND15:%.*]]
 // CHECK3:       omp_section_loop.cond15:
-// CHECK3-NEXT:    [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP10]]
+// CHECK3-NEXT:    [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP12]]
 // CHECK3-NEXT:    br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY16:%.*]], label [[OMP_SECTION_LOOP_EXIT18:%.*]]
 // CHECK3:       omp_section_loop.body16:
-// CHECK3-NEXT:    [[TMP11:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP7]]
-// CHECK3-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 1
-// CHECK3-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], 0
-// CHECK3-NEXT:    switch i32 [[TMP13]], label [[OMP_SECTION_LOOP_INC17]] [
+// CHECK3-NEXT:    [[TMP13:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP9]]
+// CHECK3-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 1
+// CHECK3-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 0
+// CHECK3-NEXT:    switch i32 [[TMP15]], label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER:%.*]] [
 // CHECK3-NEXT:    i32 0, label [[OMP_SECTION_LOOP_BODY_CASE23:%.*]]
 // CHECK3-NEXT:    i32 1, label [[OMP_SECTION_LOOP_BODY_CASE25:%.*]]
 // CHECK3-NEXT:    ]
+// CHECK3:       omp_section_loop.body.case23:
+// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3)
+// CHECK3-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0
+// CHECK3-NEXT:    br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]]
+// CHECK3:       omp_section_loop.body.case23.split:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE23_SECTION_AFTER:%.*]]
+// CHECK3:       omp_section_loop.body.case23.section.after:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]]
+// CHECK3:       omp_section_loop.body.case25:
+// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM27:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM27]], i32 3)
+// CHECK3-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0
+// CHECK3-NEXT:    br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]]
+// CHECK3:       omp_section_loop.body.case25.split:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER26:%.*]]
+// CHECK3:       omp_section_loop.body.case25.section.after26:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER:%.*]]
+// CHECK3:       omp_section_loop.body.case25.section.after:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]]
+// CHECK3:       omp_section_loop.body16.sections.after:
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_INC17]]
 // CHECK3:       omp_section_loop.inc17:
 // CHECK3-NEXT:    [[OMP_SECTION_LOOP_NEXT22]] = add nuw i32 [[OMP_SECTION_LOOP_IV20]], 1
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_HEADER14]]
 // CHECK3:       omp_section_loop.exit18:
-// CHECK3-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]])
-// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM32:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM32]])
+// CHECK3-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]])
+// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM33]])
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_AFTER19:%.*]]
 // CHECK3:       omp_section_loop.after19:
-// CHECK3-NEXT:    br label [[OMP_SECTIONS_END33:%.*]]
-// CHECK3:       omp_sections.end33:
-// CHECK3-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
-// CHECK3-NEXT:    store i32 [[TMP14]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP15]], 0
+// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_AFTER19SECTIONS_FINI:%.*]]
+// CHECK3:       omp_section_loop.after19sections.fini:
+// CHECK3-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP20]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP21]], 0
 // CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK3-NEXT:    [[SUB35:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK3-NEXT:    store i32 [[SUB35]], i32* [[DOTCAPTURE_EXPR_34]], align 4
 // CHECK3-NEXT:    store i32 0, i32* [[I]], align 4
-// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP16]]
+// CHECK3-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP22]]
 // CHECK3-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
 // CHECK3:       omp.precond.then:
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
-// CHECK3-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK3-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_UB]], align 4
 // CHECK3-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM37:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK3-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM37]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK3-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
-// CHECK3-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
+// CHECK3-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK3-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
 // CHECK3-NEXT:    br i1 [[CMP38]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK3:       cond.true:
-// CHECK3-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK3-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
 // CHECK3-NEXT:    br label [[COND_END:%.*]]
 // CHECK3:       cond.false:
-// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
 // CHECK3-NEXT:    br label [[COND_END]]
 // CHECK3:       cond.end:
-// CHECK3-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP20]], [[COND_TRUE]] ], [ [[TMP21]], [[COND_FALSE]] ]
+// CHECK3-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP26]], [[COND_TRUE]] ], [ [[TMP27]], [[COND_FALSE]] ]
 // CHECK3-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT:    store i32 [[TMP22]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT:    store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK3:       omp.inner.for.cond:
-// CHECK3-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT:    [[CMP39:%.*]] = icmp sle i32 [[TMP23]], [[TMP24]]
+// CHECK3-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT:    [[CMP39:%.*]] = icmp sle i32 [[TMP29]], [[TMP30]]
 // CHECK3-NEXT:    br i1 [[CMP39]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
 // CHECK3:       omp.inner.for.body:
-// CHECK3-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP25]], 1
+// CHECK3-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP31]], 1
 // CHECK3-NEXT:    [[ADD40:%.*]] = add nsw i32 0, [[MUL]]
 // CHECK3-NEXT:    store i32 [[ADD40]], i32* [[I36]], align 4
-// CHECK3-NEXT:    [[TMP26:%.*]] = load float, float* @flag, align 4
-// CHECK3-NEXT:    [[TOBOOL41:%.*]] = fcmp une float [[TMP26]], 0.000000e+00
+// CHECK3-NEXT:    [[TMP32:%.*]] = load float, float* @flag, align 4
+// CHECK3-NEXT:    [[TOBOOL41:%.*]] = fcmp une float [[TMP32]], 0.000000e+00
 // CHECK3-NEXT:    br i1 [[TOBOOL41]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
 // CHECK3:       omp_if.then:
 // CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM42:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
-// CHECK3-NEXT:    [[TMP27:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM42]], i32 2)
-// CHECK3-NEXT:    [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
-// CHECK3-NEXT:    br i1 [[TMP28]], label [[DOTCANCEL_EXIT:%.*]], label [[DOTCANCEL_CONTINUE:%.*]]
+// CHECK3-NEXT:    [[TMP33:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM42]], i32 2)
+// CHECK3-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
+// CHECK3-NEXT:    br i1 [[TMP34]], label [[DOTCANCEL_EXIT:%.*]], label [[DOTCANCEL_CONTINUE:%.*]]
 // CHECK3:       .cancel.exit:
 // CHECK3-NEXT:    br label [[CANCEL_EXIT:%.*]]
-// CHECK3:       omp_section_loop.body.case:
-// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-NEXT:    [[TMP29:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32 3)
-// CHECK3-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP29]], 0
-// CHECK3-NEXT:    br i1 [[TMP30]], label [[OMP_SECTION_LOOP_BODY_CASE_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE_CNCL:%.*]]
-// CHECK3:       omp_section_loop.body.case.split:
-// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_EXIT]]
 // CHECK3:       omp_section_loop.body.case.cncl:
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_EXIT]]
-// CHECK3:       omp_section_loop.body.case23:
-// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-NEXT:    [[TMP31:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3)
-// CHECK3-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
-// CHECK3-NEXT:    br i1 [[TMP32]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]]
-// CHECK3:       omp_section_loop.body.case23.split:
-// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK3:       omp_section_loop.body.case23.cncl:
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
-// CHECK3:       omp_section_loop.body.case25:
-// CHECK3-NEXT:    [[OMP_GLOBAL_THREAD_NUM26:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-NEXT:    [[TMP33:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM26]], i32 3)
-// CHECK3-NEXT:    [[TMP34:%.*]] = icmp eq i32 [[TMP33]], 0
-// CHECK3-NEXT:    br i1 [[TMP34]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]]
-// CHECK3:       omp_section_loop.body.case25.split:
-// CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK3:       omp_section_loop.body.case25.cncl:
 // CHECK3-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK3:       .cancel.continue:
@@ -1600,6 +1612,8 @@
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV8]], [[TMP10]]
 // CHECK3-NEXT:    [[CONV9:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK3-NEXT:    store i8 [[CONV9]], i8* [[ARRAYIDX7]], align 1
+// CHECK3-NEXT:    br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]]
+// CHECK3:       omp.par.region.parallel.after:
 // CHECK3-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 // CHECK3:       omp.par.pre_finalize:
 // CHECK3-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]]
@@ -1780,6 +1794,8 @@
 // CHECK3-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0
 // CHECK3-NEXT:    br i1 [[TMP10]], label [[DOTOMP_SECTIONS_CASE2_SPLIT:%.*]], label [[DOTOMP_SECTIONS_CASE2_CNCL:%.*]]
 // CHECK3:       .omp.sections.case2.split:
+// CHECK3-NEXT:    br label [[DOTOMP_SECTIONS_CASE2_SECTION_AFTER:%.*]]
+// CHECK3:       .omp.sections.case2.section.after:
 // CHECK3-NEXT:    br label [[DOTOMP_SECTIONS_EXIT]]
 // CHECK3:       .omp.sections.case2.cncl:
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_END]]
@@ -1961,6 +1977,14 @@
 // CHECK4-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[ARGV_ADDR:%.*]] = alloca i8**, align 8
+// CHECK4-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK4-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
+// CHECK4-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
+// CHECK4-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT:    [[P_LASTITER28:%.*]] = alloca i32, align 4
+// CHECK4-NEXT:    [[P_LOWERBOUND29:%.*]] = alloca i32, align 4
+// CHECK4-NEXT:    [[P_UPPERBOUND30:%.*]] = alloca i32, align 4
+// CHECK4-NEXT:    [[P_STRIDE31:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
@@ -1977,14 +2001,6 @@
 // CHECK4-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
 // CHECK4-NEXT:    store i8** [[ARGV]], i8*** [[ARGV_ADDR]], align 8
 // CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK4-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[P_LASTITER27:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[P_LOWERBOUND28:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[P_UPPERBOUND29:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[P_STRIDE30:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK4:       omp_parallel:
 // CHECK4-NEXT:    [[GEP_ARGC_ADDR:%.*]] = getelementptr { i32*, i8*** }, { i32*, i8*** }* [[STRUCTARG]], i32 0, i32 0
@@ -2018,9 +2034,20 @@
 // CHECK4-NEXT:    [[TMP4:%.*]] = add i32 [[OMP_SECTION_LOOP_IV]], [[TMP0]]
 // CHECK4-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 1
 // CHECK4-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 0
-// CHECK4-NEXT:    switch i32 [[TMP6]], label [[OMP_SECTION_LOOP_INC]] [
+// CHECK4-NEXT:    switch i32 [[TMP6]], label [[OMP_SECTION_LOOP_BODY_SECTIONS_AFTER:%.*]] [
 // CHECK4-NEXT:    i32 0, label [[OMP_SECTION_LOOP_BODY_CASE:%.*]]
 // CHECK4-NEXT:    ]
+// CHECK4:       omp_section_loop.body.case:
+// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32 3)
+// CHECK4-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
+// CHECK4-NEXT:    br i1 [[TMP8]], label [[OMP_SECTION_LOOP_BODY_CASE_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE_CNCL:%.*]]
+// CHECK4:       omp_section_loop.body.case.split:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE_SECTION_AFTER:%.*]]
+// CHECK4:       omp_section_loop.body.case.section.after:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_BODY_SECTIONS_AFTER]]
+// CHECK4:       omp_section_loop.body.sections.after:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_INC]]
 // CHECK4:       omp_section_loop.inc:
 // CHECK4-NEXT:    [[OMP_SECTION_LOOP_NEXT]] = add nuw i32 [[OMP_SECTION_LOOP_IV]], 1
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_HEADER]]
@@ -2030,125 +2057,126 @@
 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]])
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_AFTER:%.*]]
 // CHECK4:       omp_section_loop.after:
-// CHECK4-NEXT:    br label [[OMP_SECTIONS_END:%.*]]
-// CHECK4:       omp_sections.end:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_AFTERSECTIONS_FINI:%.*]]
+// CHECK4:       omp_section_loop.aftersections.fini:
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_PREHEADER13:%.*]]
 // CHECK4:       omp_section_loop.preheader13:
-// CHECK4-NEXT:    store i32 0, i32* [[P_LOWERBOUND28]], align 4
-// CHECK4-NEXT:    store i32 1, i32* [[P_UPPERBOUND29]], align 4
-// CHECK4-NEXT:    store i32 1, i32* [[P_STRIDE30]], align 4
-// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
-// CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
-// CHECK4-NEXT:    [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
-// CHECK4-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
-// CHECK4-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 1
+// CHECK4-NEXT:    store i32 0, i32* [[P_LOWERBOUND29]], align 4
+// CHECK4-NEXT:    store i32 1, i32* [[P_UPPERBOUND30]], align 4
+// CHECK4-NEXT:    store i32 1, i32* [[P_STRIDE31]], align 4
+// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM32:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]], i32 34, i32* [[P_LASTITER28]], i32* [[P_LOWERBOUND29]], i32* [[P_UPPERBOUND30]], i32* [[P_STRIDE31]], i32 1, i32 0)
+// CHECK4-NEXT:    [[TMP9:%.*]] = load i32, i32* [[P_LOWERBOUND29]], align 4
+// CHECK4-NEXT:    [[TMP10:%.*]] = load i32, i32* [[P_UPPERBOUND30]], align 4
+// CHECK4-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], [[TMP9]]
+// CHECK4-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 1
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_HEADER14:%.*]]
 // CHECK4:       omp_section_loop.header14:
 // CHECK4-NEXT:    [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER13]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ]
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_COND15:%.*]]
 // CHECK4:       omp_section_loop.cond15:
-// CHECK4-NEXT:    [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP10]]
+// CHECK4-NEXT:    [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP12]]
 // CHECK4-NEXT:    br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY16:%.*]], label [[OMP_SECTION_LOOP_EXIT18:%.*]]
 // CHECK4:       omp_section_loop.body16:
-// CHECK4-NEXT:    [[TMP11:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP7]]
-// CHECK4-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 1
-// CHECK4-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], 0
-// CHECK4-NEXT:    switch i32 [[TMP13]], label [[OMP_SECTION_LOOP_INC17]] [
+// CHECK4-NEXT:    [[TMP13:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP9]]
+// CHECK4-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 1
+// CHECK4-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 0
+// CHECK4-NEXT:    switch i32 [[TMP15]], label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER:%.*]] [
 // CHECK4-NEXT:    i32 0, label [[OMP_SECTION_LOOP_BODY_CASE23:%.*]]
 // CHECK4-NEXT:    i32 1, label [[OMP_SECTION_LOOP_BODY_CASE25:%.*]]
 // CHECK4-NEXT:    ]
+// CHECK4:       omp_section_loop.body.case23:
+// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3)
+// CHECK4-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0
+// CHECK4-NEXT:    br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]]
+// CHECK4:       omp_section_loop.body.case23.split:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE23_SECTION_AFTER:%.*]]
+// CHECK4:       omp_section_loop.body.case23.section.after:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]]
+// CHECK4:       omp_section_loop.body.case25:
+// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM27:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM27]], i32 3)
+// CHECK4-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0
+// CHECK4-NEXT:    br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]]
+// CHECK4:       omp_section_loop.body.case25.split:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER26:%.*]]
+// CHECK4:       omp_section_loop.body.case25.section.after26:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER:%.*]]
+// CHECK4:       omp_section_loop.body.case25.section.after:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]]
+// CHECK4:       omp_section_loop.body16.sections.after:
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_INC17]]
 // CHECK4:       omp_section_loop.inc17:
 // CHECK4-NEXT:    [[OMP_SECTION_LOOP_NEXT22]] = add nuw i32 [[OMP_SECTION_LOOP_IV20]], 1
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_HEADER14]]
 // CHECK4:       omp_section_loop.exit18:
-// CHECK4-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]])
-// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM32:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM32]])
+// CHECK4-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]])
+// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM33]])
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_AFTER19:%.*]]
 // CHECK4:       omp_section_loop.after19:
-// CHECK4-NEXT:    br label [[OMP_SECTIONS_END33:%.*]]
-// CHECK4:       omp_sections.end33:
-// CHECK4-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
-// CHECK4-NEXT:    store i32 [[TMP14]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK4-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK4-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP15]], 0
+// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_AFTER19SECTIONS_FINI:%.*]]
+// CHECK4:       omp_section_loop.after19sections.fini:
+// CHECK4-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK4-NEXT:    store i32 [[TMP20]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP21]], 0
 // CHECK4-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK4-NEXT:    [[SUB35:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK4-NEXT:    store i32 [[SUB35]], i32* [[DOTCAPTURE_EXPR_34]], align 4
 // CHECK4-NEXT:    store i32 0, i32* [[I]], align 4
-// CHECK4-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK4-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP16]]
+// CHECK4-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP22]]
 // CHECK4-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
 // CHECK4:       omp.precond.then:
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK4-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
-// CHECK4-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK4-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_UB]], align 4
 // CHECK4-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM37:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK4-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM37]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK4-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK4-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
-// CHECK4-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
+// CHECK4-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK4-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
 // CHECK4-NEXT:    br i1 [[CMP38]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK4:       cond.true:
-// CHECK4-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK4-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
 // CHECK4-NEXT:    br label [[COND_END:%.*]]
 // CHECK4:       cond.false:
-// CHECK4-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
 // CHECK4-NEXT:    br label [[COND_END]]
 // CHECK4:       cond.end:
-// CHECK4-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP20]], [[COND_TRUE]] ], [ [[TMP21]], [[COND_FALSE]] ]
+// CHECK4-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP26]], [[COND_TRUE]] ], [ [[TMP27]], [[COND_FALSE]] ]
 // CHECK4-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
-// CHECK4-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK4-NEXT:    store i32 [[TMP22]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT:    store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK4:       omp.inner.for.cond:
-// CHECK4-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK4-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK4-NEXT:    [[CMP39:%.*]] = icmp sle i32 [[TMP23]], [[TMP24]]
+// CHECK4-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT:    [[CMP39:%.*]] = icmp sle i32 [[TMP29]], [[TMP30]]
 // CHECK4-NEXT:    br i1 [[CMP39]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
 // CHECK4:       omp.inner.for.body:
-// CHECK4-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK4-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP25]], 1
+// CHECK4-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP31]], 1
 // CHECK4-NEXT:    [[ADD40:%.*]] = add nsw i32 0, [[MUL]]
 // CHECK4-NEXT:    store i32 [[ADD40]], i32* [[I36]], align 4
-// CHECK4-NEXT:    [[TMP26:%.*]] = load float, float* @flag, align 4
-// CHECK4-NEXT:    [[TOBOOL41:%.*]] = fcmp une float [[TMP26]], 0.000000e+00
+// CHECK4-NEXT:    [[TMP32:%.*]] = load float, float* @flag, align 4
+// CHECK4-NEXT:    [[TOBOOL41:%.*]] = fcmp une float [[TMP32]], 0.000000e+00
 // CHECK4-NEXT:    br i1 [[TOBOOL41]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
 // CHECK4:       omp_if.then:
 // CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM42:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
-// CHECK4-NEXT:    [[TMP27:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM42]], i32 2)
-// CHECK4-NEXT:    [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
-// CHECK4-NEXT:    br i1 [[TMP28]], label [[DOTCANCEL_EXIT:%.*]], label [[DOTCANCEL_CONTINUE:%.*]]
+// CHECK4-NEXT:    [[TMP33:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM42]], i32 2)
+// CHECK4-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
+// CHECK4-NEXT:    br i1 [[TMP34]], label [[DOTCANCEL_EXIT:%.*]], label [[DOTCANCEL_CONTINUE:%.*]]
 // CHECK4:       .cancel.exit:
 // CHECK4-NEXT:    br label [[CANCEL_EXIT:%.*]]
-// CHECK4:       omp_section_loop.body.case:
-// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT:    [[TMP29:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32 3)
-// CHECK4-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP29]], 0
-// CHECK4-NEXT:    br i1 [[TMP30]], label [[OMP_SECTION_LOOP_BODY_CASE_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE_CNCL:%.*]]
-// CHECK4:       omp_section_loop.body.case.split:
-// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_EXIT]]
 // CHECK4:       omp_section_loop.body.case.cncl:
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_EXIT]]
-// CHECK4:       omp_section_loop.body.case23:
-// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT:    [[TMP31:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3)
-// CHECK4-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
-// CHECK4-NEXT:    br i1 [[TMP32]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]]
-// CHECK4:       omp_section_loop.body.case23.split:
-// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK4:       omp_section_loop.body.case23.cncl:
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
-// CHECK4:       omp_section_loop.body.case25:
-// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM26:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT:    [[TMP33:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM26]], i32 3)
-// CHECK4-NEXT:    [[TMP34:%.*]] = icmp eq i32 [[TMP33]], 0
-// CHECK4-NEXT:    br i1 [[TMP34]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]]
-// CHECK4:       omp_section_loop.body.case25.split:
-// CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK4:       omp_section_loop.body.case25.cncl:
 // CHECK4-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK4:       .cancel.continue:
@@ -2236,6 +2264,8 @@
 // CHECK4-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV8]], [[TMP10]]
 // CHECK4-NEXT:    [[CONV9:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK4-NEXT:    store i8 [[CONV9]], i8* [[ARRAYIDX7]], align 1
+// CHECK4-NEXT:    br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]]
+// CHECK4:       omp.par.region.parallel.after:
 // CHECK4-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 // CHECK4:       omp.par.pre_finalize:
 // CHECK4-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]]
@@ -2416,6 +2446,8 @@
 // CHECK4-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0
 // CHECK4-NEXT:    br i1 [[TMP10]], label [[DOTOMP_SECTIONS_CASE2_SPLIT:%.*]], label [[DOTOMP_SECTIONS_CASE2_CNCL:%.*]]
 // CHECK4:       .omp.sections.case2.split:
+// CHECK4-NEXT:    br label [[DOTOMP_SECTIONS_CASE2_SECTION_AFTER:%.*]]
+// CHECK4:       .omp.sections.case2.section.after:
 // CHECK4-NEXT:    br label [[DOTOMP_SECTIONS_EXIT]]
 // CHECK4:       .omp.sections.case2.cncl:
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_END]]
@@ -3837,6 +3869,14 @@
 // CHECK9-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[ARGV_ADDR:%.*]] = alloca i8**, align 8
+// CHECK9-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[P_LASTITER28:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[P_LOWERBOUND29:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[P_UPPERBOUND30:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[P_STRIDE31:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
@@ -3853,14 +3893,6 @@
 // CHECK9-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
 // CHECK9-NEXT:    store i8** [[ARGV]], i8*** [[ARGV_ADDR]], align 8
 // CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK9-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
-// CHECK9-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
-// CHECK9-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
-// CHECK9-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
-// CHECK9-NEXT:    [[P_LASTITER27:%.*]] = alloca i32, align 4
-// CHECK9-NEXT:    [[P_LOWERBOUND28:%.*]] = alloca i32, align 4
-// CHECK9-NEXT:    [[P_UPPERBOUND29:%.*]] = alloca i32, align 4
-// CHECK9-NEXT:    [[P_STRIDE30:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK9:       omp_parallel:
 // CHECK9-NEXT:    [[GEP_ARGC_ADDR:%.*]] = getelementptr { i32*, i8*** }, { i32*, i8*** }* [[STRUCTARG]], i32 0, i32 0
@@ -3894,9 +3926,20 @@
 // CHECK9-NEXT:    [[TMP4:%.*]] = add i32 [[OMP_SECTION_LOOP_IV]], [[TMP0]]
 // CHECK9-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 1
 // CHECK9-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 0
-// CHECK9-NEXT:    switch i32 [[TMP6]], label [[OMP_SECTION_LOOP_INC]] [
+// CHECK9-NEXT:    switch i32 [[TMP6]], label [[OMP_SECTION_LOOP_BODY_SECTIONS_AFTER:%.*]] [
 // CHECK9-NEXT:    i32 0, label [[OMP_SECTION_LOOP_BODY_CASE:%.*]]
 // CHECK9-NEXT:    ]
+// CHECK9:       omp_section_loop.body.case:
+// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK9-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32 3)
+// CHECK9-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
+// CHECK9-NEXT:    br i1 [[TMP8]], label [[OMP_SECTION_LOOP_BODY_CASE_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE_CNCL:%.*]]
+// CHECK9:       omp_section_loop.body.case.split:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE_SECTION_AFTER:%.*]]
+// CHECK9:       omp_section_loop.body.case.section.after:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_BODY_SECTIONS_AFTER]]
+// CHECK9:       omp_section_loop.body.sections.after:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_INC]]
 // CHECK9:       omp_section_loop.inc:
 // CHECK9-NEXT:    [[OMP_SECTION_LOOP_NEXT]] = add nuw i32 [[OMP_SECTION_LOOP_IV]], 1
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_HEADER]]
@@ -3906,125 +3949,126 @@
 // CHECK9-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]])
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_AFTER:%.*]]
 // CHECK9:       omp_section_loop.after:
-// CHECK9-NEXT:    br label [[OMP_SECTIONS_END:%.*]]
-// CHECK9:       omp_sections.end:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_AFTERSECTIONS_FINI:%.*]]
+// CHECK9:       omp_section_loop.aftersections.fini:
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_PREHEADER13:%.*]]
 // CHECK9:       omp_section_loop.preheader13:
-// CHECK9-NEXT:    store i32 0, i32* [[P_LOWERBOUND28]], align 4
-// CHECK9-NEXT:    store i32 1, i32* [[P_UPPERBOUND29]], align 4
-// CHECK9-NEXT:    store i32 1, i32* [[P_STRIDE30]], align 4
-// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK9-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
-// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
-// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
-// CHECK9-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
-// CHECK9-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 1
+// CHECK9-NEXT:    store i32 0, i32* [[P_LOWERBOUND29]], align 4
+// CHECK9-NEXT:    store i32 1, i32* [[P_UPPERBOUND30]], align 4
+// CHECK9-NEXT:    store i32 1, i32* [[P_STRIDE31]], align 4
+// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM32:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]], i32 34, i32* [[P_LASTITER28]], i32* [[P_LOWERBOUND29]], i32* [[P_UPPERBOUND30]], i32* [[P_STRIDE31]], i32 1, i32 0)
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, i32* [[P_LOWERBOUND29]], align 4
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, i32* [[P_UPPERBOUND30]], align 4
+// CHECK9-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], [[TMP9]]
+// CHECK9-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 1
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_HEADER14:%.*]]
 // CHECK9:       omp_section_loop.header14:
 // CHECK9-NEXT:    [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER13]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ]
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_COND15:%.*]]
 // CHECK9:       omp_section_loop.cond15:
-// CHECK9-NEXT:    [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP10]]
+// CHECK9-NEXT:    [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP12]]
 // CHECK9-NEXT:    br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY16:%.*]], label [[OMP_SECTION_LOOP_EXIT18:%.*]]
 // CHECK9:       omp_section_loop.body16:
-// CHECK9-NEXT:    [[TMP11:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP7]]
-// CHECK9-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 1
-// CHECK9-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], 0
-// CHECK9-NEXT:    switch i32 [[TMP13]], label [[OMP_SECTION_LOOP_INC17]] [
+// CHECK9-NEXT:    [[TMP13:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP9]]
+// CHECK9-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 1
+// CHECK9-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 0
+// CHECK9-NEXT:    switch i32 [[TMP15]], label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER:%.*]] [
 // CHECK9-NEXT:    i32 0, label [[OMP_SECTION_LOOP_BODY_CASE23:%.*]]
 // CHECK9-NEXT:    i32 1, label [[OMP_SECTION_LOOP_BODY_CASE25:%.*]]
 // CHECK9-NEXT:    ]
+// CHECK9:       omp_section_loop.body.case23:
+// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK9-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3)
+// CHECK9-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0
+// CHECK9-NEXT:    br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]]
+// CHECK9:       omp_section_loop.body.case23.split:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE23_SECTION_AFTER:%.*]]
+// CHECK9:       omp_section_loop.body.case23.section.after:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]]
+// CHECK9:       omp_section_loop.body.case25:
+// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM27:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK9-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM27]], i32 3)
+// CHECK9-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0
+// CHECK9-NEXT:    br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]]
+// CHECK9:       omp_section_loop.body.case25.split:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER26:%.*]]
+// CHECK9:       omp_section_loop.body.case25.section.after26:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER:%.*]]
+// CHECK9:       omp_section_loop.body.case25.section.after:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]]
+// CHECK9:       omp_section_loop.body16.sections.after:
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_INC17]]
 // CHECK9:       omp_section_loop.inc17:
 // CHECK9-NEXT:    [[OMP_SECTION_LOOP_NEXT22]] = add nuw i32 [[OMP_SECTION_LOOP_IV20]], 1
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_HEADER14]]
 // CHECK9:       omp_section_loop.exit18:
-// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]])
-// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM32:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK9-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM32]])
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]])
+// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK9-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM33]])
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_AFTER19:%.*]]
 // CHECK9:       omp_section_loop.after19:
-// CHECK9-NEXT:    br label [[OMP_SECTIONS_END33:%.*]]
-// CHECK9:       omp_sections.end33:
-// CHECK9-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
-// CHECK9-NEXT:    store i32 [[TMP14]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP15]], 0
+// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_AFTER19SECTIONS_FINI:%.*]]
+// CHECK9:       omp_section_loop.after19sections.fini:
+// CHECK9-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK9-NEXT:    store i32 [[TMP20]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP21]], 0
 // CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK9-NEXT:    [[SUB35:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK9-NEXT:    store i32 [[SUB35]], i32* [[DOTCAPTURE_EXPR_34]], align 4
 // CHECK9-NEXT:    store i32 0, i32* [[I]], align 4
-// CHECK9-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP16]]
+// CHECK9-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP22]]
 // CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
 // CHECK9:       omp.precond.then:
 // CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK9-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
-// CHECK9-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK9-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_UB]], align 4
 // CHECK9-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK9-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM37:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK9-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM37]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK9-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
-// CHECK9-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
+// CHECK9-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK9-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
 // CHECK9-NEXT:    br i1 [[CMP38]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK9:       cond.true:
-// CHECK9-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK9-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
 // CHECK9-NEXT:    br label [[COND_END:%.*]]
 // CHECK9:       cond.false:
-// CHECK9-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
 // CHECK9-NEXT:    br label [[COND_END]]
 // CHECK9:       cond.end:
-// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP20]], [[COND_TRUE]] ], [ [[TMP21]], [[COND_FALSE]] ]
+// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP26]], [[COND_TRUE]] ], [ [[TMP27]], [[COND_FALSE]] ]
 // CHECK9-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
-// CHECK9-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK9-NEXT:    store i32 [[TMP22]], i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
 // CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK9:       omp.inner.for.cond:
-// CHECK9-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK9-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK9-NEXT:    [[CMP39:%.*]] = icmp sle i32 [[TMP23]], [[TMP24]]
+// CHECK9-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[CMP39:%.*]] = icmp sle i32 [[TMP29]], [[TMP30]]
 // CHECK9-NEXT:    br i1 [[CMP39]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
 // CHECK9:       omp.inner.for.body:
-// CHECK9-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP25]], 1
+// CHECK9-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP31]], 1
 // CHECK9-NEXT:    [[ADD40:%.*]] = add nsw i32 0, [[MUL]]
 // CHECK9-NEXT:    store i32 [[ADD40]], i32* [[I36]], align 4
-// CHECK9-NEXT:    [[TMP26:%.*]] = load float, float* @flag, align 4
-// CHECK9-NEXT:    [[TOBOOL41:%.*]] = fcmp une float [[TMP26]], 0.000000e+00
+// CHECK9-NEXT:    [[TMP32:%.*]] = load float, float* @flag, align 4
+// CHECK9-NEXT:    [[TOBOOL41:%.*]] = fcmp une float [[TMP32]], 0.000000e+00
 // CHECK9-NEXT:    br i1 [[TOBOOL41]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
 // CHECK9:       omp_if.then:
 // CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM42:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
-// CHECK9-NEXT:    [[TMP27:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM42]], i32 2)
-// CHECK9-NEXT:    [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
-// CHECK9-NEXT:    br i1 [[TMP28]], label [[DOTCANCEL_EXIT:%.*]], label [[DOTCANCEL_CONTINUE:%.*]]
+// CHECK9-NEXT:    [[TMP33:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM42]], i32 2)
+// CHECK9-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
+// CHECK9-NEXT:    br i1 [[TMP34]], label [[DOTCANCEL_EXIT:%.*]], label [[DOTCANCEL_CONTINUE:%.*]]
 // CHECK9:       .cancel.exit:
 // CHECK9-NEXT:    br label [[CANCEL_EXIT:%.*]]
-// CHECK9:       omp_section_loop.body.case:
-// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK9-NEXT:    [[TMP29:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32 3)
-// CHECK9-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP29]], 0
-// CHECK9-NEXT:    br i1 [[TMP30]], label [[OMP_SECTION_LOOP_BODY_CASE_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE_CNCL:%.*]]
-// CHECK9:       omp_section_loop.body.case.split:
-// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_EXIT]]
 // CHECK9:       omp_section_loop.body.case.cncl:
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_EXIT]]
-// CHECK9:       omp_section_loop.body.case23:
-// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK9-NEXT:    [[TMP31:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3)
-// CHECK9-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
-// CHECK9-NEXT:    br i1 [[TMP32]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]]
-// CHECK9:       omp_section_loop.body.case23.split:
-// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK9:       omp_section_loop.body.case23.cncl:
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
-// CHECK9:       omp_section_loop.body.case25:
-// CHECK9-NEXT:    [[OMP_GLOBAL_THREAD_NUM26:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK9-NEXT:    [[TMP33:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM26]], i32 3)
-// CHECK9-NEXT:    [[TMP34:%.*]] = icmp eq i32 [[TMP33]], 0
-// CHECK9-NEXT:    br i1 [[TMP34]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]]
-// CHECK9:       omp_section_loop.body.case25.split:
-// CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK9:       omp_section_loop.body.case25.cncl:
 // CHECK9-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK9:       .cancel.continue:
@@ -4112,6 +4156,8 @@
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV8]], [[TMP10]]
 // CHECK9-NEXT:    [[CONV9:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK9-NEXT:    store i8 [[CONV9]], i8* [[ARRAYIDX7]], align 1
+// CHECK9-NEXT:    br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]]
+// CHECK9:       omp.par.region.parallel.after:
 // CHECK9-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 // CHECK9:       omp.par.pre_finalize:
 // CHECK9-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]]
@@ -4292,6 +4338,8 @@
 // CHECK9-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0
 // CHECK9-NEXT:    br i1 [[TMP10]], label [[DOTOMP_SECTIONS_CASE2_SPLIT:%.*]], label [[DOTOMP_SECTIONS_CASE2_CNCL:%.*]]
 // CHECK9:       .omp.sections.case2.split:
+// CHECK9-NEXT:    br label [[DOTOMP_SECTIONS_CASE2_SECTION_AFTER:%.*]]
+// CHECK9:       .omp.sections.case2.section.after:
 // CHECK9-NEXT:    br label [[DOTOMP_SECTIONS_EXIT]]
 // CHECK9:       .omp.sections.case2.cncl:
 // CHECK9-NEXT:    br label [[OMP_INNER_FOR_END]]
@@ -4473,6 +4521,14 @@
 // CHECK10-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
 // CHECK10-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
 // CHECK10-NEXT:    [[ARGV_ADDR:%.*]] = alloca i8**, align 8
+// CHECK10-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[P_LASTITER28:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[P_LOWERBOUND29:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[P_UPPERBOUND30:%.*]] = alloca i32, align 4
+// CHECK10-NEXT:    [[P_STRIDE31:%.*]] = alloca i32, align 4
 // CHECK10-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
 // CHECK10-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK10-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
@@ -4489,14 +4545,6 @@
 // CHECK10-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
 // CHECK10-NEXT:    store i8** [[ARGV]], i8*** [[ARGV_ADDR]], align 8
 // CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK10-NEXT:    [[P_LASTITER:%.*]] = alloca i32, align 4
-// CHECK10-NEXT:    [[P_LOWERBOUND:%.*]] = alloca i32, align 4
-// CHECK10-NEXT:    [[P_UPPERBOUND:%.*]] = alloca i32, align 4
-// CHECK10-NEXT:    [[P_STRIDE:%.*]] = alloca i32, align 4
-// CHECK10-NEXT:    [[P_LASTITER27:%.*]] = alloca i32, align 4
-// CHECK10-NEXT:    [[P_LOWERBOUND28:%.*]] = alloca i32, align 4
-// CHECK10-NEXT:    [[P_UPPERBOUND29:%.*]] = alloca i32, align 4
-// CHECK10-NEXT:    [[P_STRIDE30:%.*]] = alloca i32, align 4
 // CHECK10-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK10:       omp_parallel:
 // CHECK10-NEXT:    [[GEP_ARGC_ADDR:%.*]] = getelementptr { i32*, i8*** }, { i32*, i8*** }* [[STRUCTARG]], i32 0, i32 0
@@ -4530,9 +4578,20 @@
 // CHECK10-NEXT:    [[TMP4:%.*]] = add i32 [[OMP_SECTION_LOOP_IV]], [[TMP0]]
 // CHECK10-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 1
 // CHECK10-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 0
-// CHECK10-NEXT:    switch i32 [[TMP6]], label [[OMP_SECTION_LOOP_INC]] [
+// CHECK10-NEXT:    switch i32 [[TMP6]], label [[OMP_SECTION_LOOP_BODY_SECTIONS_AFTER:%.*]] [
 // CHECK10-NEXT:    i32 0, label [[OMP_SECTION_LOOP_BODY_CASE:%.*]]
 // CHECK10-NEXT:    ]
+// CHECK10:       omp_section_loop.body.case:
+// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK10-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32 3)
+// CHECK10-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
+// CHECK10-NEXT:    br i1 [[TMP8]], label [[OMP_SECTION_LOOP_BODY_CASE_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE_CNCL:%.*]]
+// CHECK10:       omp_section_loop.body.case.split:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE_SECTION_AFTER:%.*]]
+// CHECK10:       omp_section_loop.body.case.section.after:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_BODY_SECTIONS_AFTER]]
+// CHECK10:       omp_section_loop.body.sections.after:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_INC]]
 // CHECK10:       omp_section_loop.inc:
 // CHECK10-NEXT:    [[OMP_SECTION_LOOP_NEXT]] = add nuw i32 [[OMP_SECTION_LOOP_IV]], 1
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_HEADER]]
@@ -4542,125 +4601,126 @@
 // CHECK10-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]])
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_AFTER:%.*]]
 // CHECK10:       omp_section_loop.after:
-// CHECK10-NEXT:    br label [[OMP_SECTIONS_END:%.*]]
-// CHECK10:       omp_sections.end:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_AFTERSECTIONS_FINI:%.*]]
+// CHECK10:       omp_section_loop.aftersections.fini:
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_PREHEADER13:%.*]]
 // CHECK10:       omp_section_loop.preheader13:
-// CHECK10-NEXT:    store i32 0, i32* [[P_LOWERBOUND28]], align 4
-// CHECK10-NEXT:    store i32 1, i32* [[P_UPPERBOUND29]], align 4
-// CHECK10-NEXT:    store i32 1, i32* [[P_STRIDE30]], align 4
-// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK10-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
-// CHECK10-NEXT:    [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
-// CHECK10-NEXT:    [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
-// CHECK10-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
-// CHECK10-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 1
+// CHECK10-NEXT:    store i32 0, i32* [[P_LOWERBOUND29]], align 4
+// CHECK10-NEXT:    store i32 1, i32* [[P_UPPERBOUND30]], align 4
+// CHECK10-NEXT:    store i32 1, i32* [[P_STRIDE31]], align 4
+// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM32:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK10-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]], i32 34, i32* [[P_LASTITER28]], i32* [[P_LOWERBOUND29]], i32* [[P_UPPERBOUND30]], i32* [[P_STRIDE31]], i32 1, i32 0)
+// CHECK10-NEXT:    [[TMP9:%.*]] = load i32, i32* [[P_LOWERBOUND29]], align 4
+// CHECK10-NEXT:    [[TMP10:%.*]] = load i32, i32* [[P_UPPERBOUND30]], align 4
+// CHECK10-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], [[TMP9]]
+// CHECK10-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 1
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_HEADER14:%.*]]
 // CHECK10:       omp_section_loop.header14:
 // CHECK10-NEXT:    [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER13]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ]
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_COND15:%.*]]
 // CHECK10:       omp_section_loop.cond15:
-// CHECK10-NEXT:    [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP10]]
+// CHECK10-NEXT:    [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP12]]
 // CHECK10-NEXT:    br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY16:%.*]], label [[OMP_SECTION_LOOP_EXIT18:%.*]]
 // CHECK10:       omp_section_loop.body16:
-// CHECK10-NEXT:    [[TMP11:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP7]]
-// CHECK10-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 1
-// CHECK10-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], 0
-// CHECK10-NEXT:    switch i32 [[TMP13]], label [[OMP_SECTION_LOOP_INC17]] [
+// CHECK10-NEXT:    [[TMP13:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP9]]
+// CHECK10-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 1
+// CHECK10-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 0
+// CHECK10-NEXT:    switch i32 [[TMP15]], label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER:%.*]] [
 // CHECK10-NEXT:    i32 0, label [[OMP_SECTION_LOOP_BODY_CASE23:%.*]]
 // CHECK10-NEXT:    i32 1, label [[OMP_SECTION_LOOP_BODY_CASE25:%.*]]
 // CHECK10-NEXT:    ]
+// CHECK10:       omp_section_loop.body.case23:
+// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK10-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3)
+// CHECK10-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0
+// CHECK10-NEXT:    br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]]
+// CHECK10:       omp_section_loop.body.case23.split:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE23_SECTION_AFTER:%.*]]
+// CHECK10:       omp_section_loop.body.case23.section.after:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]]
+// CHECK10:       omp_section_loop.body.case25:
+// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM27:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK10-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM27]], i32 3)
+// CHECK10-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0
+// CHECK10-NEXT:    br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]]
+// CHECK10:       omp_section_loop.body.case25.split:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER26:%.*]]
+// CHECK10:       omp_section_loop.body.case25.section.after26:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER:%.*]]
+// CHECK10:       omp_section_loop.body.case25.section.after:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]]
+// CHECK10:       omp_section_loop.body16.sections.after:
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_INC17]]
 // CHECK10:       omp_section_loop.inc17:
 // CHECK10-NEXT:    [[OMP_SECTION_LOOP_NEXT22]] = add nuw i32 [[OMP_SECTION_LOOP_IV20]], 1
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_HEADER14]]
 // CHECK10:       omp_section_loop.exit18:
-// CHECK10-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]])
-// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM32:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK10-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM32]])
+// CHECK10-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]])
+// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK10-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM33]])
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_AFTER19:%.*]]
 // CHECK10:       omp_section_loop.after19:
-// CHECK10-NEXT:    br label [[OMP_SECTIONS_END33:%.*]]
-// CHECK10:       omp_sections.end33:
-// CHECK10-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
-// CHECK10-NEXT:    store i32 [[TMP14]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK10-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK10-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP15]], 0
+// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_AFTER19SECTIONS_FINI:%.*]]
+// CHECK10:       omp_section_loop.after19sections.fini:
+// CHECK10-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK10-NEXT:    store i32 [[TMP20]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP21]], 0
 // CHECK10-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK10-NEXT:    [[SUB35:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK10-NEXT:    store i32 [[SUB35]], i32* [[DOTCAPTURE_EXPR_34]], align 4
 // CHECK10-NEXT:    store i32 0, i32* [[I]], align 4
-// CHECK10-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK10-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP16]]
+// CHECK10-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP22]]
 // CHECK10-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
 // CHECK10:       omp.precond.then:
 // CHECK10-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK10-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
-// CHECK10-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK10-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_UB]], align 4
 // CHECK10-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK10-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM37:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK10-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM37]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK10-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK10-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
-// CHECK10-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
+// CHECK10-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK10-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
 // CHECK10-NEXT:    br i1 [[CMP38]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK10:       cond.true:
-// CHECK10-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
+// CHECK10-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_34]], align 4
 // CHECK10-NEXT:    br label [[COND_END:%.*]]
 // CHECK10:       cond.false:
-// CHECK10-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
 // CHECK10-NEXT:    br label [[COND_END]]
 // CHECK10:       cond.end:
-// CHECK10-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP20]], [[COND_TRUE]] ], [ [[TMP21]], [[COND_FALSE]] ]
+// CHECK10-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP26]], [[COND_TRUE]] ], [ [[TMP27]], [[COND_FALSE]] ]
 // CHECK10-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
-// CHECK10-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK10-NEXT:    store i32 [[TMP22]], i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK10-NEXT:    store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
 // CHECK10-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK10:       omp.inner.for.cond:
-// CHECK10-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK10-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK10-NEXT:    [[CMP39:%.*]] = icmp sle i32 [[TMP23]], [[TMP24]]
+// CHECK10-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK10-NEXT:    [[CMP39:%.*]] = icmp sle i32 [[TMP29]], [[TMP30]]
 // CHECK10-NEXT:    br i1 [[CMP39]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
 // CHECK10:       omp.inner.for.body:
-// CHECK10-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK10-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP25]], 1
+// CHECK10-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK10-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP31]], 1
 // CHECK10-NEXT:    [[ADD40:%.*]] = add nsw i32 0, [[MUL]]
 // CHECK10-NEXT:    store i32 [[ADD40]], i32* [[I36]], align 4
-// CHECK10-NEXT:    [[TMP26:%.*]] = load float, float* @flag, align 4
-// CHECK10-NEXT:    [[TOBOOL41:%.*]] = fcmp une float [[TMP26]], 0.000000e+00
+// CHECK10-NEXT:    [[TMP32:%.*]] = load float, float* @flag, align 4
+// CHECK10-NEXT:    [[TOBOOL41:%.*]] = fcmp une float [[TMP32]], 0.000000e+00
 // CHECK10-NEXT:    br i1 [[TOBOOL41]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
 // CHECK10:       omp_if.then:
 // CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM42:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
-// CHECK10-NEXT:    [[TMP27:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM42]], i32 2)
-// CHECK10-NEXT:    [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
-// CHECK10-NEXT:    br i1 [[TMP28]], label [[DOTCANCEL_EXIT:%.*]], label [[DOTCANCEL_CONTINUE:%.*]]
+// CHECK10-NEXT:    [[TMP33:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM42]], i32 2)
+// CHECK10-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
+// CHECK10-NEXT:    br i1 [[TMP34]], label [[DOTCANCEL_EXIT:%.*]], label [[DOTCANCEL_CONTINUE:%.*]]
 // CHECK10:       .cancel.exit:
 // CHECK10-NEXT:    br label [[CANCEL_EXIT:%.*]]
-// CHECK10:       omp_section_loop.body.case:
-// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK10-NEXT:    [[TMP29:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32 3)
-// CHECK10-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP29]], 0
-// CHECK10-NEXT:    br i1 [[TMP30]], label [[OMP_SECTION_LOOP_BODY_CASE_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE_CNCL:%.*]]
-// CHECK10:       omp_section_loop.body.case.split:
-// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_EXIT]]
 // CHECK10:       omp_section_loop.body.case.cncl:
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_EXIT]]
-// CHECK10:       omp_section_loop.body.case23:
-// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK10-NEXT:    [[TMP31:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3)
-// CHECK10-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
-// CHECK10-NEXT:    br i1 [[TMP32]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]]
-// CHECK10:       omp_section_loop.body.case23.split:
-// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK10:       omp_section_loop.body.case23.cncl:
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
-// CHECK10:       omp_section_loop.body.case25:
-// CHECK10-NEXT:    [[OMP_GLOBAL_THREAD_NUM26:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK10-NEXT:    [[TMP33:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM26]], i32 3)
-// CHECK10-NEXT:    [[TMP34:%.*]] = icmp eq i32 [[TMP33]], 0
-// CHECK10-NEXT:    br i1 [[TMP34]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]]
-// CHECK10:       omp_section_loop.body.case25.split:
-// CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK10:       omp_section_loop.body.case25.cncl:
 // CHECK10-NEXT:    br label [[OMP_SECTION_LOOP_EXIT18]]
 // CHECK10:       .cancel.continue:
@@ -4748,6 +4808,8 @@
 // CHECK10-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV8]], [[TMP10]]
 // CHECK10-NEXT:    [[CONV9:%.*]] = trunc i32 [[ADD]] to i8
 // CHECK10-NEXT:    store i8 [[CONV9]], i8* [[ARRAYIDX7]], align 1
+// CHECK10-NEXT:    br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]]
+// CHECK10:       omp.par.region.parallel.after:
 // CHECK10-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 // CHECK10:       omp.par.pre_finalize:
 // CHECK10-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]]
@@ -4928,6 +4990,8 @@
 // CHECK10-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0
 // CHECK10-NEXT:    br i1 [[TMP10]], label [[DOTOMP_SECTIONS_CASE2_SPLIT:%.*]], label [[DOTOMP_SECTIONS_CASE2_CNCL:%.*]]
 // CHECK10:       .omp.sections.case2.split:
+// CHECK10-NEXT:    br label [[DOTOMP_SECTIONS_CASE2_SECTION_AFTER:%.*]]
+// CHECK10:       .omp.sections.case2.section.after:
 // CHECK10-NEXT:    br label [[DOTOMP_SECTIONS_EXIT]]
 // CHECK10:       .omp.sections.case2.cncl:
 // CHECK10-NEXT:    br label [[OMP_INNER_FOR_END]]
diff --git a/clang/test/OpenMP/critical_codegen.cpp b/clang/test/OpenMP/critical_codegen.cpp
--- a/clang/test/OpenMP/critical_codegen.cpp
+++ b/clang/test/OpenMP/critical_codegen.cpp
@@ -33,6 +33,8 @@
 // ALL:       			[[GTID:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:@.+]])
 // ALL:       			call {{.*}}void @__kmpc_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[UNNAMED_LOCK]])
 // ALL-NEXT:  			store i8 2, i8* [[A_ADDR]]
+// IRBUILDER-NEXT:		br label %[[AFTER:[^ ,]+]]
+// IRBUILDER:			[[AFTER]]
 // ALL-NEXT:  			call {{.*}}void @__kmpc_end_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[UNNAMED_LOCK]])
 #pragma omp critical
   a = 2;
@@ -40,6 +42,8 @@
 // ALL:       			call {{.*}}void @__kmpc_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
 // IRBUILDER-NEXT:	call {{.*}}void [[FOO]]()
 // NORMAL-NEXT:  		invoke {{.*}}void [[FOO]]()
+// IRBUILDER-NEXT:		br label %[[AFTER:[^ ,]+]]
+// IRBUILDER:			[[AFTER]]
 // ALL:      				call {{.*}}void @__kmpc_end_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
 #pragma omp critical(the_name)
   foo();
@@ -47,15 +51,17 @@
 // ALL: 	      		call {{.*}}void @__kmpc_critical_with_hint([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK1]], i{{64|32}} 23)
 // IRBUILDER-NEXT:	call {{.*}}void [[FOO]]()
 // NORMAL-NEXT:		  invoke {{.*}}void [[FOO]]()
+// IRBUILDER-NEXT:		br label %[[AFTER:[^ ,]+]]
+// IRBUILDER:			[[AFTER]]
 // ALL:		       		call {{.*}}void @__kmpc_end_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK1]])
 #pragma omp critical(the_name1) hint(23)
   foo();
   // IRBUILDER:   		[[GTID:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:@.+]])
   // ALL:       call {{.*}}void @__kmpc_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
-  // ALL:       br label
-  // ALL-NOT:   call {{.*}}void @__kmpc_end_critical(
-  // ALL:       br label
-  // ALL-NOT:   call {{.*}}void @__kmpc_end_critical(
+  // NORMAL:       br label
+  // NORMAL-NOT:   call {{.*}}void @__kmpc_end_critical(
+  // NORMAL:       br label
+  // NORMAL-NOT:   call {{.*}}void @__kmpc_end_critical(
   // NORMAL:       br label
   if (a)
 #pragma omp critical(the_name)
diff --git a/clang/test/OpenMP/critical_codegen_attr.cpp b/clang/test/OpenMP/critical_codegen_attr.cpp
--- a/clang/test/OpenMP/critical_codegen_attr.cpp
+++ b/clang/test/OpenMP/critical_codegen_attr.cpp
@@ -33,6 +33,8 @@
 // ALL:       			[[GTID:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:@.+]])
 // ALL:       			call {{.*}}void @__kmpc_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[UNNAMED_LOCK]])
 // ALL-NEXT:  			store i8 2, i8* [[A_ADDR]]
+// IRBUILDER-NEXT:		br label %[[AFTER:[^ ,]+]]
+// IRBUILDER:			[[AFTER]]
 // ALL-NEXT:  			call {{.*}}void @__kmpc_end_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[UNNAMED_LOCK]])
   [[omp::directive(critical)]]
   a = 2;
@@ -40,6 +42,8 @@
 // ALL:       			call {{.*}}void @__kmpc_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
 // IRBUILDER-NEXT:	call {{.*}}void [[FOO]]()
 // NORMAL-NEXT:  		invoke {{.*}}void [[FOO]]()
+// IRBUILDER-NEXT:		br label %[[AFTER:[^ ,]+]]
+// IRBUILDER:			[[AFTER]]
 // ALL:      				call {{.*}}void @__kmpc_end_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
   [[omp::directive(critical(the_name))]]
   foo();
@@ -47,15 +51,17 @@
 // ALL: 	      		call {{.*}}void @__kmpc_critical_with_hint([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK1]], i{{64|32}} 23)
 // IRBUILDER-NEXT:	call {{.*}}void [[FOO]]()
 // NORMAL-NEXT:		  invoke {{.*}}void [[FOO]]()
+// IRBUILDER-NEXT:		br label %[[AFTER:[^ ,]+]]
+// IRBUILDER:			[[AFTER]]
 // ALL:		       		call {{.*}}void @__kmpc_end_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK1]])
   [[omp::directive(critical(the_name1) hint(23))]]
   foo();
   // IRBUILDER:   		[[GTID:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:@.+]])
   // ALL:       call {{.*}}void @__kmpc_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
-  // ALL:       br label
-  // ALL-NOT:   call {{.*}}void @__kmpc_end_critical(
-  // ALL:       br label
-  // ALL-NOT:   call {{.*}}void @__kmpc_end_critical(
+  // NORMAL:       br label
+  // NORMAL-NOT:   call {{.*}}void @__kmpc_end_critical(
+  // NORMAL:       br label
+  // NORMAL-NOT:   call {{.*}}void @__kmpc_end_critical(
   // NORMAL:       br label
   if (a)
     [[omp::directive(critical(the_name))]]
diff --git a/clang/test/OpenMP/masked_codegen.cpp b/clang/test/OpenMP/masked_codegen.cpp
--- a/clang/test/OpenMP/masked_codegen.cpp
+++ b/clang/test/OpenMP/masked_codegen.cpp
@@ -33,6 +33,8 @@
 // ALL-NEXT:  			br i1 [[IS_MASKED]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]]
 // ALL:       			[[THEN]]
 // ALL-NEXT:  			store i8 2, i8* [[A_ADDR]]
+// IRBUILDER-NEXT:		br label %[[AFTER:[^ ,]+]]
+// IRBUILDER:			[[AFTER]]
 // ALL-NEXT:  			call {{.*}}void @__kmpc_end_masked([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
 // ALL-NEXT:  			br label {{%?}}[[EXIT]]
 // ALL:       			[[EXIT]]
diff --git a/clang/test/OpenMP/master_codegen.cpp b/clang/test/OpenMP/master_codegen.cpp
--- a/clang/test/OpenMP/master_codegen.cpp
+++ b/clang/test/OpenMP/master_codegen.cpp
@@ -33,6 +33,8 @@
 // ALL-NEXT:  			br i1 [[IS_MASTER]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]]
 // ALL:       			[[THEN]]
 // ALL-NEXT:  			store i8 2, i8* [[A_ADDR]]
+// IRBUILDER-NEXT:		br label %[[AFTER:[^ ,]+]]
+// IRBUILDER:			[[AFTER]]
 // ALL-NEXT:  			call {{.*}}void @__kmpc_end_master([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
 // ALL-NEXT:  			br label {{%?}}[[EXIT]]
 // ALL:       			[[EXIT]]
diff --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp
--- a/clang/test/OpenMP/ordered_codegen.cpp
+++ b/clang/test/OpenMP/ordered_codegen.cpp
@@ -1,19 +1,19 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefixes=CHECK1,CHECK1-NORMAL
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefixes=CHECK1
 // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK2,CHECK2-NORMAL
+// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK2
 
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefixes=CHECK1,CHECK1-IRBUILDER
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefixes=CHECK1-IRBUILDER
 // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK2,CHECK2-IRBUILDER
+// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK2-IRBUILDER
 
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -fopenmp-version=45 -o - | FileCheck %s --check-prefixes=CHECK3,CHECK3-NORMAL
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -fopenmp-version=45 -o - | FileCheck %s --check-prefixes=CHECK3
 // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -fopenmp-version=45 -o %t %s
-// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK4,CHECK4-NORMAL
+// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK4
 
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -fopenmp-version=45 -o - | FileCheck %s --check-prefixes=CHECK3,CHECK3-IRBUILDER
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -fopenmp-version=45 -o - | FileCheck %s --check-prefixes=CHECK3-IRBUILDER
 // RUN: %clang_cc1 -no-opaque-pointers -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -fopenmp-version=45 -o %t %s
-// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -fopenmp-enable-irbuilder -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK4,CHECK4-IRBUILDER
+// RUN: %clang_cc1 -no-opaque-pointers -fopenmp -fopenmp-enable-irbuilder -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK4-IRBUILDER
 
 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK5
 // RUN: %clang_cc1 -no-opaque-pointers -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
@@ -136,7 +136,7 @@
 // CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
 // CHECK1-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -145,12 +145,9 @@
 // CHECK1-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
-// CHECK1-NORMAL-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK1-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -168,7 +165,6 @@
 // CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP5]], 7
 // CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
 // CHECK1-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[I]], align 4
@@ -200,7 +196,6 @@
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], 1
 // CHECK1-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
@@ -208,9 +203,7 @@
 // CHECK1:       omp.dispatch.inc:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP0]])
-// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -228,7 +221,7 @@
 // CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I:%.*]] = alloca i64, align 8
-// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -237,11 +230,9 @@
 // CHECK1-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
 // CHECK1-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741891, i64 0, i64 16908287, i64 1, i64 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -260,7 +251,6 @@
 // CHECK1-NEXT:    [[MUL:%.*]] = mul i64 [[TMP5]], 127
 // CHECK1-NEXT:    [[ADD1:%.*]] = add i64 131071, [[MUL]]
 // CHECK1-NEXT:    store i64 [[ADD1]], i64* [[I]], align 8
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = load i64, i64* [[I]], align 8
@@ -288,7 +278,6 @@
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK1-NEXT:    [[ADD7:%.*]] = add i64 [[TMP17]], 1
 // CHECK1-NEXT:    store i64 [[ADD7]], i64* [[DOTOMP_IV]], align 8
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
@@ -296,9 +285,7 @@
 // CHECK1:       omp.dispatch.inc:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -324,7 +311,7 @@
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I8:%.*]] = alloca i8, align 1
 // CHECK1-NEXT:    [[X9:%.*]] = alloca i32, align 4
-// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -357,11 +344,9 @@
 // CHECK1-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741894, i64 0, i64 [[TMP6]], i64 1, i64 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK1-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -392,7 +377,6 @@
 // CHECK1-NEXT:    [[SUB20:%.*]] = sub nsw i64 11, [[MUL19]]
 // CHECK1-NEXT:    [[CONV21:%.*]] = trunc i64 [[SUB20]] to i32
 // CHECK1-NEXT:    store i32 [[CONV21]], i32* [[X9]], align 4
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    [[TMP15:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8, i8* [[I8]], align 1
@@ -424,7 +408,6 @@
 // CHECK1-NEXT:    [[TMP26:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK1-NEXT:    [[ADD30:%.*]] = add nsw i64 [[TMP26]], 1
 // CHECK1-NEXT:    store i64 [[ADD30]], i64* [[DOTOMP_IV]], align 8
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
@@ -434,9 +417,7 @@
 // CHECK1:       omp.dispatch.end:
 // CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK1:       omp.precond.end:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -457,7 +438,7 @@
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I:%.*]] = alloca i8, align 1
 // CHECK1-NEXT:    [[X2:%.*]] = alloca i32, align 4
-// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -467,11 +448,9 @@
 // CHECK1-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741893, i32 0, i32 199, i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -499,7 +478,6 @@
 // CHECK1-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
 // CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 -10, [[MUL5]]
 // CHECK1-NEXT:    store i32 [[ADD6]], i32* [[X2]], align 4
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8, i8* [[I]], align 1
@@ -531,7 +509,6 @@
 // CHECK1-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP19]], 1
 // CHECK1-NEXT:    store i32 [[ADD15]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
@@ -539,9 +516,7 @@
 // CHECK1:       omp.dispatch.inc:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -568,7 +543,7 @@
 // CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I28:%.*]] = alloca i32, align 4
-// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
@@ -655,11 +630,9 @@
 // CHECK1-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK1-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 [[TMP25]], i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK1-NEXT:    [[TMP26:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP26]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -667,15 +640,13 @@
 // CHECK1-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
 // CHECK1-NEXT:    store i32 [[TMP27]], i32* [[DOTOMP_IV16]], align 4
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND29:%.*]]
-// CHECK1-IRBUILDER:       omp.inner.for.cond30:
-// CHECK1-NORMAL:       omp.inner.for.cond29:
+// CHECK1:       omp.inner.for.cond29:
 // CHECK1-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[ADD30:%.*]] = add i32 [[TMP29]], 1
 // CHECK1-NEXT:    [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]]
 // CHECK1-NEXT:    br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]]
-// CHECK1-IRBUILDER:       omp.inner.for.body33:
-// CHECK1-NORMAL:       omp.inner.for.body32:
+// CHECK1:       omp.inner.for.body32:
 // CHECK1-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[MUL33:%.*]] = mul i32 [[TMP31]], 1
@@ -687,19 +658,15 @@
 // CHECK1-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX36]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
 // CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE37:%.*]]
-// CHECK1-IRBUILDER:       omp.body.continue38:
-// CHECK1-NORMAL:       omp.body.continue37:
+// CHECK1:       omp.body.continue37:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC38:%.*]]
-// CHECK1-IRBUILDER:       omp.inner.for.inc39:
-// CHECK1-NORMAL:       omp.inner.for.inc38:
+// CHECK1:       omp.inner.for.inc38:
 // CHECK1-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[ADD39:%.*]] = add i32 [[TMP33]], 1
 // CHECK1-NEXT:    store i32 [[ADD39]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group !7
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]]
-// CHECK1-IRBUILDER:       omp.inner.for.end42:
-// CHECK1-NORMAL:       omp.inner.for.end40:
+// CHECK1:       omp.inner.for.end40:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK1:       omp.dispatch.inc:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
@@ -722,9 +689,7 @@
 // CHECK1:       .omp.final.done:
 // CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK1:       omp.precond.end:
-// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -768,7 +733,7 @@
 // CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
 // CHECK2-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -777,12 +742,9 @@
 // CHECK2-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
 // CHECK2-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
-// CHECK2-NORMAL-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK2-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -800,7 +762,6 @@
 // CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP5]], 7
 // CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
 // CHECK2-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[I]], align 4
@@ -832,7 +793,6 @@
 // CHECK2-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], 1
 // CHECK2-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK2:       omp.inner.for.end:
@@ -840,9 +800,7 @@
 // CHECK2:       omp.dispatch.inc:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK2:       omp.dispatch.end:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP0]])
-// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -860,7 +818,7 @@
 // CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I:%.*]] = alloca i64, align 8
-// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -869,11 +827,9 @@
 // CHECK2-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
 // CHECK2-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741891, i64 0, i64 16908287, i64 1, i64 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -892,7 +848,6 @@
 // CHECK2-NEXT:    [[MUL:%.*]] = mul i64 [[TMP5]], 127
 // CHECK2-NEXT:    [[ADD1:%.*]] = add i64 131071, [[MUL]]
 // CHECK2-NEXT:    store i64 [[ADD1]], i64* [[I]], align 8
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    [[TMP7:%.*]] = load i64, i64* [[I]], align 8
@@ -920,7 +875,6 @@
 // CHECK2-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK2-NEXT:    [[ADD7:%.*]] = add i64 [[TMP17]], 1
 // CHECK2-NEXT:    store i64 [[ADD7]], i64* [[DOTOMP_IV]], align 8
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK2:       omp.inner.for.end:
@@ -928,9 +882,7 @@
 // CHECK2:       omp.dispatch.inc:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK2:       omp.dispatch.end:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -956,7 +908,7 @@
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I8:%.*]] = alloca i8, align 1
 // CHECK2-NEXT:    [[X9:%.*]] = alloca i32, align 4
-// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -989,11 +941,9 @@
 // CHECK2-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK2-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741894, i64 0, i64 [[TMP6]], i64 1, i64 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK2-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1024,7 +974,6 @@
 // CHECK2-NEXT:    [[SUB20:%.*]] = sub nsw i64 11, [[MUL19]]
 // CHECK2-NEXT:    [[CONV21:%.*]] = trunc i64 [[SUB20]] to i32
 // CHECK2-NEXT:    store i32 [[CONV21]], i32* [[X9]], align 4
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    [[TMP15:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    [[TMP16:%.*]] = load i8, i8* [[I8]], align 1
@@ -1056,7 +1005,6 @@
 // CHECK2-NEXT:    [[TMP26:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK2-NEXT:    [[ADD30:%.*]] = add nsw i64 [[TMP26]], 1
 // CHECK2-NEXT:    store i64 [[ADD30]], i64* [[DOTOMP_IV]], align 8
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK2:       omp.inner.for.end:
@@ -1066,9 +1014,7 @@
 // CHECK2:       omp.dispatch.end:
 // CHECK2-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK2:       omp.precond.end:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -1089,7 +1035,7 @@
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I:%.*]] = alloca i8, align 1
 // CHECK2-NEXT:    [[X2:%.*]] = alloca i32, align 4
-// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1099,11 +1045,9 @@
 // CHECK2-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
 // CHECK2-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741893, i32 0, i32 199, i32 1, i32 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1131,7 +1075,6 @@
 // CHECK2-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
 // CHECK2-NEXT:    [[ADD6:%.*]] = add nsw i32 -10, [[MUL5]]
 // CHECK2-NEXT:    store i32 [[ADD6]], i32* [[X2]], align 4
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8, i8* [[I]], align 1
@@ -1163,7 +1106,6 @@
 // CHECK2-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK2-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP19]], 1
 // CHECK2-NEXT:    store i32 [[ADD15]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK2:       omp.inner.for.end:
@@ -1171,9 +1113,7 @@
 // CHECK2:       omp.dispatch.inc:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK2:       omp.dispatch.end:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -1200,7 +1140,7 @@
 // CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I28:%.*]] = alloca i32, align 4
-// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
 // CHECK2-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
 // CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
@@ -1287,11 +1227,9 @@
 // CHECK2-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK2-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 [[TMP25]], i32 1, i32 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK2-NEXT:    [[TMP26:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP26]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1299,15 +1237,13 @@
 // CHECK2-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
 // CHECK2-NEXT:    store i32 [[TMP27]], i32* [[DOTOMP_IV16]], align 4
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND29:%.*]]
-// CHECK2-IRBUILDER:       omp.inner.for.cond30:
-// CHECK2-NORMAL:       omp.inner.for.cond29:
+// CHECK2:       omp.inner.for.cond29:
 // CHECK2-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[ADD30:%.*]] = add i32 [[TMP29]], 1
 // CHECK2-NEXT:    [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]]
 // CHECK2-NEXT:    br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]]
-// CHECK2-IRBUILDER:       omp.inner.for.body33:
-// CHECK2-NORMAL:       omp.inner.for.body32:
+// CHECK2:       omp.inner.for.body32:
 // CHECK2-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[MUL33:%.*]] = mul i32 [[TMP31]], 1
@@ -1319,19 +1255,15 @@
 // CHECK2-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX36]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
 // CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE37:%.*]]
-// CHECK2-IRBUILDER:       omp.body.continue38:
-// CHECK2-NORMAL:       omp.body.continue37:
+// CHECK2:       omp.body.continue37:
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC38:%.*]]
-// CHECK2-IRBUILDER:       omp.inner.for.inc39:
-// CHECK2-NORMAL:       omp.inner.for.inc38:
+// CHECK2:       omp.inner.for.inc38:
 // CHECK2-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[ADD39:%.*]] = add i32 [[TMP33]], 1
 // CHECK2-NEXT:    store i32 [[ADD39]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group !7
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]]
-// CHECK2-IRBUILDER:       omp.inner.for.end42:
-// CHECK2-NORMAL:       omp.inner.for.end40:
+// CHECK2:       omp.inner.for.end40:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK2:       omp.dispatch.inc:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND]]
@@ -1354,9 +1286,7 @@
 // CHECK2:       .omp.final.done:
 // CHECK2-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK2:       omp.precond.end:
-// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -1386,6 +1316,1262 @@
 // CHECK2-NEXT:    ret void
 //
 //
+// CHECK1-IRBUILDER-LABEL: define {{[^@]+}}@_Z18static_not_chunkedPfS_S_S_
+// CHECK1-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-IRBUILDER-NEXT:  entry:
+// CHECK1-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK1-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK1-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]]
+// CHECK1-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 7
+// CHECK1-IRBUILDER-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
+// CHECK1-IRBUILDER-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP5:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 [[IDXPROM]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP8:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM3:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[IDXPROM3]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP11:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM6:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM6]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX7]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP13]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM9:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM9]]
+// CHECK1-IRBUILDER-NEXT:    store float [[MUL8]], float* [[ARRAYIDX10]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1-IRBUILDER:       omp.body.continue:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.inc:
+// CHECK1-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK1-IRBUILDER-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]])
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1-IRBUILDER:       omp.inner.for.end:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.inc:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK1-IRBUILDER:       omp.dispatch.end:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]])
+// CHECK1-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK1-IRBUILDER-LABEL: define {{[^@]+}}@_Z8dynamic1PfS_S_S_
+// CHECK1-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK1-IRBUILDER-NEXT:  entry:
+// CHECK1-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[I:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1073741891, i64 0, i64 16908287, i64 1, i64 1)
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
+// CHECK1-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK1-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i64 [[TMP1]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[ADD:%.*]] = add i64 [[TMP3]], 1
+// CHECK1-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp ult i64 [[TMP2]], [[ADD]]
+// CHECK1-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[MUL:%.*]] = mul i64 [[TMP4]], 127
+// CHECK1-IRBUILDER-NEXT:    [[ADD2:%.*]] = add i64 131071, [[MUL]]
+// CHECK1-IRBUILDER-NEXT:    store i64 [[ADD2]], i64* [[I]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP5:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i64, i64* [[I]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 [[TMP6]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP8:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i64, i64* [[I]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[TMP9]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP11:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i64, i64* [[I]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[TMP12]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i64, i64* [[I]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP15]]
+// CHECK1-IRBUILDER-NEXT:    store float [[MUL7]], float* [[ARRAYIDX8]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1-IRBUILDER:       omp.body.continue:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.inc:
+// CHECK1-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[ADD9:%.*]] = add i64 [[TMP16]], 1
+// CHECK1-IRBUILDER-NEXT:    store i64 [[ADD9]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]])
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1-IRBUILDER:       omp.inner.for.end:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.inc:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK1-IRBUILDER:       omp.dispatch.end:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM11]])
+// CHECK1-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK1-IRBUILDER-LABEL: define {{[^@]+}}@_Z9test_autoPfS_S_S_
+// CHECK1-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK1-IRBUILDER-NEXT:  entry:
+// CHECK1-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[Y:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK1-IRBUILDER-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1
+// CHECK1-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[I:%.*]] = alloca i8, align 1
+// CHECK1-IRBUILDER-NEXT:    [[X6:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[I8:%.*]] = alloca i8, align 1
+// CHECK1-IRBUILDER-NEXT:    [[X9:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[X]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[Y]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32, i32* [[Y]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[CONV:%.*]] = trunc i32 [[TMP0]] to i8
+// CHECK1-IRBUILDER-NEXT:    store i8 [[CONV]], i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP1]] to i32
+// CHECK1-IRBUILDER-NEXT:    [[SUB:%.*]] = sub i32 57, [[CONV3]]
+// CHECK1-IRBUILDER-NEXT:    [[ADD:%.*]] = add i32 [[SUB]], 1
+// CHECK1-IRBUILDER-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
+// CHECK1-IRBUILDER-NEXT:    [[CONV4:%.*]] = zext i32 [[DIV]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV4]], 11
+// CHECK1-IRBUILDER-NEXT:    [[SUB5:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK1-IRBUILDER-NEXT:    store i64 [[SUB5]], i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-IRBUILDER-NEXT:    store i8 [[TMP2]], i8* [[I]], align 1
+// CHECK1-IRBUILDER-NEXT:    store i32 11, i32* [[X6]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[CONV7:%.*]] = sext i8 [[TMP3]] to i32
+// CHECK1-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[CONV7]], 57
+// CHECK1-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1-IRBUILDER:       omp.precond.then:
+// CHECK1-IRBUILDER-NEXT:    store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i64 [[TMP4]], i64* [[DOTOMP_UB]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1073741894, i64 0, i64 [[TMP5]], i64 1, i64 1)
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
+// CHECK1-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK1-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i64 [[TMP7]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[CMP11:%.*]] = icmp sle i64 [[TMP8]], [[TMP9]]
+// CHECK1-IRBUILDER-NEXT:    br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP10:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[CONV12:%.*]] = sext i8 [[TMP10]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[DIV13:%.*]] = sdiv i64 [[TMP11]], 11
+// CHECK1-IRBUILDER-NEXT:    [[MUL14:%.*]] = mul nsw i64 [[DIV13]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD15:%.*]] = add nsw i64 [[CONV12]], [[MUL14]]
+// CHECK1-IRBUILDER-NEXT:    [[CONV16:%.*]] = trunc i64 [[ADD15]] to i8
+// CHECK1-IRBUILDER-NEXT:    store i8 [[CONV16]], i8* [[I8]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[DIV17:%.*]] = sdiv i64 [[TMP13]], 11
+// CHECK1-IRBUILDER-NEXT:    [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 11
+// CHECK1-IRBUILDER-NEXT:    [[SUB19:%.*]] = sub nsw i64 [[TMP12]], [[MUL18]]
+// CHECK1-IRBUILDER-NEXT:    [[MUL20:%.*]] = mul nsw i64 [[SUB19]], 1
+// CHECK1-IRBUILDER-NEXT:    [[SUB21:%.*]] = sub nsw i64 11, [[MUL20]]
+// CHECK1-IRBUILDER-NEXT:    [[CONV22:%.*]] = trunc i64 [[SUB21]] to i32
+// CHECK1-IRBUILDER-NEXT:    store i32 [[CONV22]], i32* [[X9]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM23:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i8 [[TMP15]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP17:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM24:%.*]] = sext i8 [[TMP18]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds float, float* [[TMP17]], i64 [[IDXPROM24]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX25]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[MUL26:%.*]] = fmul float [[TMP16]], [[TMP19]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP20:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP21:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM27:%.*]] = sext i8 [[TMP21]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[IDXPROM27]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX28]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[MUL29:%.*]] = fmul float [[MUL26]], [[TMP22]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP23:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP24:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM30:%.*]] = sext i8 [[TMP24]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds float, float* [[TMP23]], i64 [[IDXPROM30]]
+// CHECK1-IRBUILDER-NEXT:    store float [[MUL29]], float* [[ARRAYIDX31]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]])
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1-IRBUILDER:       omp.body.continue:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.inc:
+// CHECK1-IRBUILDER-NEXT:    [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[ADD32:%.*]] = add nsw i64 [[TMP25]], 1
+// CHECK1-IRBUILDER-NEXT:    store i64 [[ADD32]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM33]])
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1-IRBUILDER:       omp.inner.for.end:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.inc:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK1-IRBUILDER:       omp.dispatch.end:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK1-IRBUILDER:       omp.precond.end:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM34:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM34]])
+// CHECK1-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK1-IRBUILDER-LABEL: define {{[^@]+}}@_Z7runtimePfS_S_S_
+// CHECK1-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK1-IRBUILDER-NEXT:  entry:
+// CHECK1-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK1-IRBUILDER-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK1-IRBUILDER-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[I:%.*]] = alloca i8, align 1
+// CHECK1-IRBUILDER-NEXT:    [[X2:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[X]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1073741893, i32 0, i32 199, i32 1, i32 1)
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK1-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK1-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]]
+// CHECK1-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP4]], 20
+// CHECK1-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD:%.*]] = add nsw i32 48, [[MUL]]
+// CHECK1-IRBUILDER-NEXT:    [[CONV:%.*]] = trunc i32 [[ADD]] to i8
+// CHECK1-IRBUILDER-NEXT:    store i8 [[CONV]], i8* [[I]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP6]], 20
+// CHECK1-IRBUILDER-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 20
+// CHECK1-IRBUILDER-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP5]], [[MUL5]]
+// CHECK1-IRBUILDER-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD7:%.*]] = add nsw i32 -10, [[MUL6]]
+// CHECK1-IRBUILDER-NEXT:    store i32 [[ADD7]], i32* [[X2]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i8, i8* [[I]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP7]], i64 [[IDXPROM]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i8, i8* [[I]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[TMP10]], i64 [[IDXPROM9]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX10]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP14:%.*]] = load i8, i8* [[I]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 [[IDXPROM12]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]]
+// CHECK1-IRBUILDER-NEXT:    [[TMP16:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP17:%.*]] = load i8, i8* [[I]], align 1
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[TMP16]], i64 [[IDXPROM15]]
+// CHECK1-IRBUILDER-NEXT:    store float [[MUL14]], float* [[ARRAYIDX16]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]])
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1-IRBUILDER:       omp.body.continue:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.inc:
+// CHECK1-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK1-IRBUILDER-NEXT:    store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM18:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM18]])
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK1-IRBUILDER:       omp.inner.for.end:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.inc:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK1-IRBUILDER:       omp.dispatch.end:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM19:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM19]])
+// CHECK1-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK1-IRBUILDER-LABEL: define {{[^@]+}}@_Z8foo_simdii
+// CHECK1-IRBUILDER-SAME: (i32 noundef [[LOW:%.*]], i32 noundef [[UP:%.*]]) #[[ATTR0]] {
+// CHECK1-IRBUILDER-NEXT:  entry:
+// CHECK1-IRBUILDER-NEXT:    [[LOW_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[UP_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[I5:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IV16:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[_TMP17:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_18:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_20:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[I26:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    [[I28:%.*]] = alloca i32, align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[UP_ADDR]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[SUB:%.*]] = sub i32 [[TMP2]], [[TMP3]]
+// CHECK1-IRBUILDER-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 1
+// CHECK1-IRBUILDER-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
+// CHECK1-IRBUILDER-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-IRBUILDER-NEXT:    store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP4]], i32* [[I]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP5]], [[TMP6]]
+// CHECK1-IRBUILDER-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK1-IRBUILDER:       simd.if.then:
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    [[ADD6:%.*]] = add i32 [[TMP8]], 1
+// CHECK1-IRBUILDER-NEXT:    [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]]
+// CHECK1-IRBUILDER-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    [[MUL:%.*]] = mul i32 [[TMP10]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]]
+// CHECK1-IRBUILDER-NEXT:    store i32 [[ADD8]], i32* [[I5]], align 4, !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i32, i32* [[I5]], align 4, !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK1-IRBUILDER-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX]], align 4, !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    call void @__captured_stmt(i32* [[I5]]), !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1-IRBUILDER:       omp.body.continue:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.inc:
+// CHECK1-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    [[ADD9:%.*]] = add i32 [[TMP12]], 1
+// CHECK1-IRBUILDER-NEXT:    store i32 [[ADD9]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// CHECK1-IRBUILDER:       omp.inner.for.end:
+// CHECK1-IRBUILDER-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[SUB10:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+// CHECK1-IRBUILDER-NEXT:    [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD12:%.*]] = add i32 [[SUB11]], 1
+// CHECK1-IRBUILDER-NEXT:    [[DIV13:%.*]] = udiv i32 [[ADD12]], 1
+// CHECK1-IRBUILDER-NEXT:    [[MUL14:%.*]] = mul i32 [[DIV13]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD15:%.*]] = add i32 [[TMP13]], [[MUL14]]
+// CHECK1-IRBUILDER-NEXT:    store i32 [[ADD15]], i32* [[I5]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[SIMD_IF_END]]
+// CHECK1-IRBUILDER:       simd.if.end:
+// CHECK1-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP16]], i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP17:%.*]] = load i32, i32* [[UP_ADDR]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP17]], i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[SUB21:%.*]] = sub i32 [[TMP18]], [[TMP19]]
+// CHECK1-IRBUILDER-NEXT:    [[SUB22:%.*]] = sub i32 [[SUB21]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD23:%.*]] = add i32 [[SUB22]], 1
+// CHECK1-IRBUILDER-NEXT:    [[DIV24:%.*]] = udiv i32 [[ADD23]], 1
+// CHECK1-IRBUILDER-NEXT:    [[SUB25:%.*]] = sub i32 [[DIV24]], 1
+// CHECK1-IRBUILDER-NEXT:    store i32 [[SUB25]], i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP20]], i32* [[I26]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[CMP27:%.*]] = icmp slt i32 [[TMP21]], [[TMP22]]
+// CHECK1-IRBUILDER-NEXT:    br i1 [[CMP27]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1-IRBUILDER:       omp.precond.then:
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 66, i32 0, i32 [[TMP24]], i32 1, i32 1)
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM29:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM29]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK1-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK1-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.body:
+// CHECK1-IRBUILDER-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-IRBUILDER-NEXT:    store i32 [[TMP26]], i32* [[DOTOMP_IV16]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND30:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.cond30:
+// CHECK1-IRBUILDER-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    [[ADD31:%.*]] = add i32 [[TMP28]], 1
+// CHECK1-IRBUILDER-NEXT:    [[CMP32:%.*]] = icmp ult i32 [[TMP27]], [[ADD31]]
+// CHECK1-IRBUILDER-NEXT:    br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END42:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body33:
+// CHECK1-IRBUILDER-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    [[MUL34:%.*]] = mul i32 [[TMP30]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD35:%.*]] = add i32 [[TMP29]], [[MUL34]]
+// CHECK1-IRBUILDER-NEXT:    store i32 [[ADD35]], i32* [[I28]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    [[TMP31:%.*]] = load i32, i32* [[I28]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM36:%.*]] = sext i32 [[TMP31]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM36]]
+// CHECK1-IRBUILDER-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX37]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.body33.ordered.after:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE38:%.*]]
+// CHECK1-IRBUILDER:       omp.body.continue38:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC39:%.*]]
+// CHECK1-IRBUILDER:       omp.inner.for.inc39:
+// CHECK1-IRBUILDER-NEXT:    [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    [[ADD40:%.*]] = add i32 [[TMP32]], 1
+// CHECK1-IRBUILDER-NEXT:    store i32 [[ADD40]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]), !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK1-IRBUILDER:       omp.inner.for.end42:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK1-IRBUILDER:       omp.dispatch.inc:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK1-IRBUILDER:       omp.dispatch.end:
+// CHECK1-IRBUILDER-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
+// CHECK1-IRBUILDER-NEXT:    br i1 [[TMP34]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1-IRBUILDER:       .omp.final.then:
+// CHECK1-IRBUILDER-NEXT:    [[TMP35:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[SUB43:%.*]] = sub i32 [[TMP36]], [[TMP37]]
+// CHECK1-IRBUILDER-NEXT:    [[SUB44:%.*]] = sub i32 [[SUB43]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD45:%.*]] = add i32 [[SUB44]], 1
+// CHECK1-IRBUILDER-NEXT:    [[DIV46:%.*]] = udiv i32 [[ADD45]], 1
+// CHECK1-IRBUILDER-NEXT:    [[MUL47:%.*]] = mul i32 [[DIV46]], 1
+// CHECK1-IRBUILDER-NEXT:    [[ADD48:%.*]] = add i32 [[TMP35]], [[MUL47]]
+// CHECK1-IRBUILDER-NEXT:    store i32 [[ADD48]], i32* [[I28]], align 4
+// CHECK1-IRBUILDER-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK1-IRBUILDER:       .omp.final.done:
+// CHECK1-IRBUILDER-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK1-IRBUILDER:       omp.precond.end:
+// CHECK1-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM49:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM49]])
+// CHECK1-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK1-IRBUILDER-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK1-IRBUILDER-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK1-IRBUILDER-NEXT:  entry:
+// CHECK1-IRBUILDER-NEXT:    [[I_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-IRBUILDER-NEXT:    store i32* [[I]], i32** [[I_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK1-IRBUILDER-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX]], align 4
+// CHECK1-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK1-IRBUILDER-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK1-IRBUILDER-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR3]] {
+// CHECK1-IRBUILDER-NEXT:  entry:
+// CHECK1-IRBUILDER-NEXT:    [[I_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-IRBUILDER-NEXT:    store i32* [[I]], i32** [[I_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK1-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK1-IRBUILDER-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX]], align 4
+// CHECK1-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK2-IRBUILDER-LABEL: define {{[^@]+}}@_Z18static_not_chunkedPfS_S_S_
+// CHECK2-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK2-IRBUILDER-NEXT:  entry:
+// CHECK2-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK2-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK2-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]]
+// CHECK2-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 7
+// CHECK2-IRBUILDER-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
+// CHECK2-IRBUILDER-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP5:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[I]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 [[IDXPROM]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP8:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM3:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[IDXPROM3]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP11:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i32, i32* [[I]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM6:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM6]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX7]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP13]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM9:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM9]]
+// CHECK2-IRBUILDER-NEXT:    store float [[MUL8]], float* [[ARRAYIDX10]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2-IRBUILDER:       omp.body.continue:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.inc:
+// CHECK2-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK2-IRBUILDER-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]])
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2-IRBUILDER:       omp.inner.for.end:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.inc:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK2-IRBUILDER:       omp.dispatch.end:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]])
+// CHECK2-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK2-IRBUILDER-LABEL: define {{[^@]+}}@_Z8dynamic1PfS_S_S_
+// CHECK2-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK2-IRBUILDER-NEXT:  entry:
+// CHECK2-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[I:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1073741891, i64 0, i64 16908287, i64 1, i64 1)
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
+// CHECK2-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK2-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i64 [[TMP1]], i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[ADD:%.*]] = add i64 [[TMP3]], 1
+// CHECK2-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp ult i64 [[TMP2]], [[ADD]]
+// CHECK2-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[MUL:%.*]] = mul i64 [[TMP4]], 127
+// CHECK2-IRBUILDER-NEXT:    [[ADD2:%.*]] = add i64 131071, [[MUL]]
+// CHECK2-IRBUILDER-NEXT:    store i64 [[ADD2]], i64* [[I]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP5:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i64, i64* [[I]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 [[TMP6]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP8:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i64, i64* [[I]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[TMP9]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP11:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i64, i64* [[I]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[TMP12]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i64, i64* [[I]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP15]]
+// CHECK2-IRBUILDER-NEXT:    store float [[MUL7]], float* [[ARRAYIDX8]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2-IRBUILDER:       omp.body.continue:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.inc:
+// CHECK2-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[ADD9:%.*]] = add i64 [[TMP16]], 1
+// CHECK2-IRBUILDER-NEXT:    store i64 [[ADD9]], i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]])
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2-IRBUILDER:       omp.inner.for.end:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.inc:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK2-IRBUILDER:       omp.dispatch.end:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM11]])
+// CHECK2-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK2-IRBUILDER-LABEL: define {{[^@]+}}@_Z9test_autoPfS_S_S_
+// CHECK2-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK2-IRBUILDER-NEXT:  entry:
+// CHECK2-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[Y:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK2-IRBUILDER-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1
+// CHECK2-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[I:%.*]] = alloca i8, align 1
+// CHECK2-IRBUILDER-NEXT:    [[X6:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[I8:%.*]] = alloca i8, align 1
+// CHECK2-IRBUILDER-NEXT:    [[X9:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[X]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[Y]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32, i32* [[Y]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[CONV:%.*]] = trunc i32 [[TMP0]] to i8
+// CHECK2-IRBUILDER-NEXT:    store i8 [[CONV]], i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP1]] to i32
+// CHECK2-IRBUILDER-NEXT:    [[SUB:%.*]] = sub i32 57, [[CONV3]]
+// CHECK2-IRBUILDER-NEXT:    [[ADD:%.*]] = add i32 [[SUB]], 1
+// CHECK2-IRBUILDER-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
+// CHECK2-IRBUILDER-NEXT:    [[CONV4:%.*]] = zext i32 [[DIV]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV4]], 11
+// CHECK2-IRBUILDER-NEXT:    [[SUB5:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK2-IRBUILDER-NEXT:    store i64 [[SUB5]], i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK2-IRBUILDER-NEXT:    store i8 [[TMP2]], i8* [[I]], align 1
+// CHECK2-IRBUILDER-NEXT:    store i32 11, i32* [[X6]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[CONV7:%.*]] = sext i8 [[TMP3]] to i32
+// CHECK2-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[CONV7]], 57
+// CHECK2-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2-IRBUILDER:       omp.precond.then:
+// CHECK2-IRBUILDER-NEXT:    store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i64 [[TMP4]], i64* [[DOTOMP_UB]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1073741894, i64 0, i64 [[TMP5]], i64 1, i64 1)
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
+// CHECK2-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK2-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i64 [[TMP7]], i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[CMP11:%.*]] = icmp sle i64 [[TMP8]], [[TMP9]]
+// CHECK2-IRBUILDER-NEXT:    br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP10:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[CONV12:%.*]] = sext i8 [[TMP10]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[DIV13:%.*]] = sdiv i64 [[TMP11]], 11
+// CHECK2-IRBUILDER-NEXT:    [[MUL14:%.*]] = mul nsw i64 [[DIV13]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD15:%.*]] = add nsw i64 [[CONV12]], [[MUL14]]
+// CHECK2-IRBUILDER-NEXT:    [[CONV16:%.*]] = trunc i64 [[ADD15]] to i8
+// CHECK2-IRBUILDER-NEXT:    store i8 [[CONV16]], i8* [[I8]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[DIV17:%.*]] = sdiv i64 [[TMP13]], 11
+// CHECK2-IRBUILDER-NEXT:    [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 11
+// CHECK2-IRBUILDER-NEXT:    [[SUB19:%.*]] = sub nsw i64 [[TMP12]], [[MUL18]]
+// CHECK2-IRBUILDER-NEXT:    [[MUL20:%.*]] = mul nsw i64 [[SUB19]], 1
+// CHECK2-IRBUILDER-NEXT:    [[SUB21:%.*]] = sub nsw i64 11, [[MUL20]]
+// CHECK2-IRBUILDER-NEXT:    [[CONV22:%.*]] = trunc i64 [[SUB21]] to i32
+// CHECK2-IRBUILDER-NEXT:    store i32 [[CONV22]], i32* [[X9]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM23:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i8 [[TMP15]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP17:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM24:%.*]] = sext i8 [[TMP18]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds float, float* [[TMP17]], i64 [[IDXPROM24]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX25]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[MUL26:%.*]] = fmul float [[TMP16]], [[TMP19]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP20:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP21:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM27:%.*]] = sext i8 [[TMP21]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[IDXPROM27]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX28]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[MUL29:%.*]] = fmul float [[MUL26]], [[TMP22]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP23:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP24:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM30:%.*]] = sext i8 [[TMP24]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds float, float* [[TMP23]], i64 [[IDXPROM30]]
+// CHECK2-IRBUILDER-NEXT:    store float [[MUL29]], float* [[ARRAYIDX31]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]])
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2-IRBUILDER:       omp.body.continue:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.inc:
+// CHECK2-IRBUILDER-NEXT:    [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[ADD32:%.*]] = add nsw i64 [[TMP25]], 1
+// CHECK2-IRBUILDER-NEXT:    store i64 [[ADD32]], i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM33]])
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2-IRBUILDER:       omp.inner.for.end:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.inc:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK2-IRBUILDER:       omp.dispatch.end:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK2-IRBUILDER:       omp.precond.end:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM34:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM34]])
+// CHECK2-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK2-IRBUILDER-LABEL: define {{[^@]+}}@_Z7runtimePfS_S_S_
+// CHECK2-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK2-IRBUILDER-NEXT:  entry:
+// CHECK2-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK2-IRBUILDER-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK2-IRBUILDER-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[I:%.*]] = alloca i8, align 1
+// CHECK2-IRBUILDER-NEXT:    [[X2:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[X]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1073741893, i32 0, i32 199, i32 1, i32 1)
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK2-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK2-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]]
+// CHECK2-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP4]], 20
+// CHECK2-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD:%.*]] = add nsw i32 48, [[MUL]]
+// CHECK2-IRBUILDER-NEXT:    [[CONV:%.*]] = trunc i32 [[ADD]] to i8
+// CHECK2-IRBUILDER-NEXT:    store i8 [[CONV]], i8* [[I]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP6]], 20
+// CHECK2-IRBUILDER-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 20
+// CHECK2-IRBUILDER-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP5]], [[MUL5]]
+// CHECK2-IRBUILDER-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD7:%.*]] = add nsw i32 -10, [[MUL6]]
+// CHECK2-IRBUILDER-NEXT:    store i32 [[ADD7]], i32* [[X2]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i8, i8* [[I]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP7]], i64 [[IDXPROM]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i8, i8* [[I]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[TMP10]], i64 [[IDXPROM9]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX10]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP14:%.*]] = load i8, i8* [[I]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 [[IDXPROM12]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]]
+// CHECK2-IRBUILDER-NEXT:    [[TMP16:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP17:%.*]] = load i8, i8* [[I]], align 1
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[TMP16]], i64 [[IDXPROM15]]
+// CHECK2-IRBUILDER-NEXT:    store float [[MUL14]], float* [[ARRAYIDX16]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]])
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2-IRBUILDER:       omp.body.continue:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.inc:
+// CHECK2-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK2-IRBUILDER-NEXT:    store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM18:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM18]])
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK2-IRBUILDER:       omp.inner.for.end:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.inc:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK2-IRBUILDER:       omp.dispatch.end:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM19:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM19]])
+// CHECK2-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK2-IRBUILDER-LABEL: define {{[^@]+}}@_Z8foo_simdii
+// CHECK2-IRBUILDER-SAME: (i32 noundef [[LOW:%.*]], i32 noundef [[UP:%.*]]) #[[ATTR0]] {
+// CHECK2-IRBUILDER-NEXT:  entry:
+// CHECK2-IRBUILDER-NEXT:    [[LOW_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[UP_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[I5:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IV16:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[_TMP17:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_18:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_20:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[I26:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    [[I28:%.*]] = alloca i32, align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[UP_ADDR]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[SUB:%.*]] = sub i32 [[TMP2]], [[TMP3]]
+// CHECK2-IRBUILDER-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 1
+// CHECK2-IRBUILDER-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
+// CHECK2-IRBUILDER-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-IRBUILDER-NEXT:    store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP4]], i32* [[I]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP5]], [[TMP6]]
+// CHECK2-IRBUILDER-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK2-IRBUILDER:       simd.if.then:
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    [[ADD6:%.*]] = add i32 [[TMP8]], 1
+// CHECK2-IRBUILDER-NEXT:    [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]]
+// CHECK2-IRBUILDER-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    [[MUL:%.*]] = mul i32 [[TMP10]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]]
+// CHECK2-IRBUILDER-NEXT:    store i32 [[ADD8]], i32* [[I5]], align 4, !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i32, i32* [[I5]], align 4, !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK2-IRBUILDER-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX]], align 4, !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    call void @__captured_stmt(i32* [[I5]]), !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2-IRBUILDER:       omp.body.continue:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.inc:
+// CHECK2-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    [[ADD9:%.*]] = add i32 [[TMP12]], 1
+// CHECK2-IRBUILDER-NEXT:    store i32 [[ADD9]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// CHECK2-IRBUILDER:       omp.inner.for.end:
+// CHECK2-IRBUILDER-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[SUB10:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+// CHECK2-IRBUILDER-NEXT:    [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD12:%.*]] = add i32 [[SUB11]], 1
+// CHECK2-IRBUILDER-NEXT:    [[DIV13:%.*]] = udiv i32 [[ADD12]], 1
+// CHECK2-IRBUILDER-NEXT:    [[MUL14:%.*]] = mul i32 [[DIV13]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD15:%.*]] = add i32 [[TMP13]], [[MUL14]]
+// CHECK2-IRBUILDER-NEXT:    store i32 [[ADD15]], i32* [[I5]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[SIMD_IF_END]]
+// CHECK2-IRBUILDER:       simd.if.end:
+// CHECK2-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP16]], i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP17:%.*]] = load i32, i32* [[UP_ADDR]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP17]], i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[SUB21:%.*]] = sub i32 [[TMP18]], [[TMP19]]
+// CHECK2-IRBUILDER-NEXT:    [[SUB22:%.*]] = sub i32 [[SUB21]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD23:%.*]] = add i32 [[SUB22]], 1
+// CHECK2-IRBUILDER-NEXT:    [[DIV24:%.*]] = udiv i32 [[ADD23]], 1
+// CHECK2-IRBUILDER-NEXT:    [[SUB25:%.*]] = sub i32 [[DIV24]], 1
+// CHECK2-IRBUILDER-NEXT:    store i32 [[SUB25]], i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP20]], i32* [[I26]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[CMP27:%.*]] = icmp slt i32 [[TMP21]], [[TMP22]]
+// CHECK2-IRBUILDER-NEXT:    br i1 [[CMP27]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2-IRBUILDER:       omp.precond.then:
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_UB]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 66, i32 0, i32 [[TMP24]], i32 1, i32 1)
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM29:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM29]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK2-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK2-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.body:
+// CHECK2-IRBUILDER-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK2-IRBUILDER-NEXT:    store i32 [[TMP26]], i32* [[DOTOMP_IV16]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND30:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.cond30:
+// CHECK2-IRBUILDER-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    [[ADD31:%.*]] = add i32 [[TMP28]], 1
+// CHECK2-IRBUILDER-NEXT:    [[CMP32:%.*]] = icmp ult i32 [[TMP27]], [[ADD31]]
+// CHECK2-IRBUILDER-NEXT:    br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END42:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body33:
+// CHECK2-IRBUILDER-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    [[MUL34:%.*]] = mul i32 [[TMP30]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD35:%.*]] = add i32 [[TMP29]], [[MUL34]]
+// CHECK2-IRBUILDER-NEXT:    store i32 [[ADD35]], i32* [[I28]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    [[TMP31:%.*]] = load i32, i32* [[I28]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM36:%.*]] = sext i32 [[TMP31]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM36]]
+// CHECK2-IRBUILDER-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX37]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.body33.ordered.after:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE38:%.*]]
+// CHECK2-IRBUILDER:       omp.body.continue38:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC39:%.*]]
+// CHECK2-IRBUILDER:       omp.inner.for.inc39:
+// CHECK2-IRBUILDER-NEXT:    [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    [[ADD40:%.*]] = add i32 [[TMP32]], 1
+// CHECK2-IRBUILDER-NEXT:    store i32 [[ADD40]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]), !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2-IRBUILDER:       omp.inner.for.end42:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK2-IRBUILDER:       omp.dispatch.inc:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK2-IRBUILDER:       omp.dispatch.end:
+// CHECK2-IRBUILDER-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
+// CHECK2-IRBUILDER-NEXT:    br i1 [[TMP34]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK2-IRBUILDER:       .omp.final.then:
+// CHECK2-IRBUILDER-NEXT:    [[TMP35:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[SUB43:%.*]] = sub i32 [[TMP36]], [[TMP37]]
+// CHECK2-IRBUILDER-NEXT:    [[SUB44:%.*]] = sub i32 [[SUB43]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD45:%.*]] = add i32 [[SUB44]], 1
+// CHECK2-IRBUILDER-NEXT:    [[DIV46:%.*]] = udiv i32 [[ADD45]], 1
+// CHECK2-IRBUILDER-NEXT:    [[MUL47:%.*]] = mul i32 [[DIV46]], 1
+// CHECK2-IRBUILDER-NEXT:    [[ADD48:%.*]] = add i32 [[TMP35]], [[MUL47]]
+// CHECK2-IRBUILDER-NEXT:    store i32 [[ADD48]], i32* [[I28]], align 4
+// CHECK2-IRBUILDER-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK2-IRBUILDER:       .omp.final.done:
+// CHECK2-IRBUILDER-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK2-IRBUILDER:       omp.precond.end:
+// CHECK2-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM49:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM49]])
+// CHECK2-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK2-IRBUILDER-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK2-IRBUILDER-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK2-IRBUILDER-NEXT:  entry:
+// CHECK2-IRBUILDER-NEXT:    [[I_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-IRBUILDER-NEXT:    store i32* [[I]], i32** [[I_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK2-IRBUILDER-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX]], align 4
+// CHECK2-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK2-IRBUILDER-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK2-IRBUILDER-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR3]] {
+// CHECK2-IRBUILDER-NEXT:  entry:
+// CHECK2-IRBUILDER-NEXT:    [[I_ADDR:%.*]] = alloca i32*, align 8
+// CHECK2-IRBUILDER-NEXT:    store i32* [[I]], i32** [[I_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK2-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK2-IRBUILDER-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX]], align 4
+// CHECK2-IRBUILDER-NEXT:    ret void
+//
+//
 // CHECK3-LABEL: define {{[^@]+}}@_Z18static_not_chunkedPfS_S_S_
 // CHECK3-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK3-NEXT:  entry:
@@ -1400,7 +2586,7 @@
 // CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
 // CHECK3-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1409,12 +2595,9 @@
 // CHECK3-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
 // CHECK3-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
-// CHECK3-NORMAL-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK3-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1432,7 +2615,6 @@
 // CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP5]], 7
 // CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
 // CHECK3-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[I]], align 4
@@ -1464,7 +2646,6 @@
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], 1
 // CHECK3-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK3:       omp.inner.for.end:
@@ -1472,9 +2653,7 @@
 // CHECK3:       omp.dispatch.inc:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK3:       omp.dispatch.end:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP0]])
-// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -1492,7 +2671,7 @@
 // CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I:%.*]] = alloca i64, align 8
-// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1501,11 +2680,9 @@
 // CHECK3-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
 // CHECK3-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 67, i64 0, i64 16908287, i64 1, i64 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1524,7 +2701,6 @@
 // CHECK3-NEXT:    [[MUL:%.*]] = mul i64 [[TMP5]], 127
 // CHECK3-NEXT:    [[ADD1:%.*]] = add i64 131071, [[MUL]]
 // CHECK3-NEXT:    store i64 [[ADD1]], i64* [[I]], align 8
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    [[TMP7:%.*]] = load i64, i64* [[I]], align 8
@@ -1552,7 +2728,6 @@
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK3-NEXT:    [[ADD7:%.*]] = add i64 [[TMP17]], 1
 // CHECK3-NEXT:    store i64 [[ADD7]], i64* [[DOTOMP_IV]], align 8
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK3:       omp.inner.for.end:
@@ -1560,9 +2735,7 @@
 // CHECK3:       omp.dispatch.inc:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK3:       omp.dispatch.end:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -1588,7 +2761,7 @@
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I8:%.*]] = alloca i8, align 1
 // CHECK3-NEXT:    [[X9:%.*]] = alloca i32, align 4
-// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1621,11 +2794,9 @@
 // CHECK3-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 70, i64 0, i64 [[TMP6]], i64 1, i64 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK3-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1656,7 +2827,6 @@
 // CHECK3-NEXT:    [[SUB20:%.*]] = sub nsw i64 11, [[MUL19]]
 // CHECK3-NEXT:    [[CONV21:%.*]] = trunc i64 [[SUB20]] to i32
 // CHECK3-NEXT:    store i32 [[CONV21]], i32* [[X9]], align 4
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    [[TMP15:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    [[TMP16:%.*]] = load i8, i8* [[I8]], align 1
@@ -1688,7 +2858,6 @@
 // CHECK3-NEXT:    [[TMP26:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK3-NEXT:    [[ADD30:%.*]] = add nsw i64 [[TMP26]], 1
 // CHECK3-NEXT:    store i64 [[ADD30]], i64* [[DOTOMP_IV]], align 8
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK3:       omp.inner.for.end:
@@ -1698,9 +2867,7 @@
 // CHECK3:       omp.dispatch.end:
 // CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK3:       omp.precond.end:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -1721,7 +2888,7 @@
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I:%.*]] = alloca i8, align 1
 // CHECK3-NEXT:    [[X2:%.*]] = alloca i32, align 4
-// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1731,11 +2898,9 @@
 // CHECK3-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
 // CHECK3-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 69, i32 0, i32 199, i32 1, i32 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1763,7 +2928,6 @@
 // CHECK3-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
 // CHECK3-NEXT:    [[ADD6:%.*]] = add nsw i32 -10, [[MUL5]]
 // CHECK3-NEXT:    store i32 [[ADD6]], i32* [[X2]], align 4
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8, i8* [[I]], align 1
@@ -1795,7 +2959,6 @@
 // CHECK3-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK3-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP19]], 1
 // CHECK3-NEXT:    store i32 [[ADD15]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK3:       omp.inner.for.end:
@@ -1803,9 +2966,7 @@
 // CHECK3:       omp.dispatch.inc:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK3:       omp.dispatch.end:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -1832,7 +2993,7 @@
 // CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I28:%.*]] = alloca i32, align 4
-// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
@@ -1919,11 +3080,9 @@
 // CHECK3-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK3-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 [[TMP25]], i32 1, i32 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK3-NEXT:    [[TMP26:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP26]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1931,15 +3090,13 @@
 // CHECK3-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
 // CHECK3-NEXT:    store i32 [[TMP27]], i32* [[DOTOMP_IV16]], align 4
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND29:%.*]]
-// CHECK3-IRBUILDER:       omp.inner.for.cond30:
-// CHECK3-NORMAL:       omp.inner.for.cond29:
+// CHECK3:       omp.inner.for.cond29:
 // CHECK3-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[ADD30:%.*]] = add i32 [[TMP29]], 1
 // CHECK3-NEXT:    [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]]
 // CHECK3-NEXT:    br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]]
-// CHECK3-IRBUILDER:       omp.inner.for.body33:
-// CHECK3-NORMAL:       omp.inner.for.body32:
+// CHECK3:       omp.inner.for.body32:
 // CHECK3-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[MUL33:%.*]] = mul i32 [[TMP31]], 1
@@ -1951,19 +3108,15 @@
 // CHECK3-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX36]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
 // CHECK3-NEXT:    br label [[OMP_BODY_CONTINUE37:%.*]]
-// CHECK3-IRBUILDER:       omp.body.continue38:
-// CHECK3-NORMAL:       omp.body.continue37:
+// CHECK3:       omp.body.continue37:
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC38:%.*]]
-// CHECK3-IRBUILDER:       omp.inner.for.inc39:
-// CHECK3-NORMAL:       omp.inner.for.inc38:
+// CHECK3:       omp.inner.for.inc38:
 // CHECK3-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[ADD39:%.*]] = add i32 [[TMP33]], 1
 // CHECK3-NEXT:    store i32 [[ADD39]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group !7
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]]
-// CHECK3-IRBUILDER:       omp.inner.for.end42:
-// CHECK3-NORMAL:       omp.inner.for.end40:
+// CHECK3:       omp.inner.for.end40:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK3:       omp.dispatch.inc:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND]]
@@ -1986,9 +3139,7 @@
 // CHECK3:       .omp.final.done:
 // CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK3:       omp.precond.end:
-// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -2032,7 +3183,7 @@
 // CHECK4-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
 // CHECK4-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -2041,12 +3192,9 @@
 // CHECK4-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
 // CHECK4-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
-// CHECK4-NORMAL-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK4-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK4-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -2064,7 +3212,6 @@
 // CHECK4-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP5]], 7
 // CHECK4-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
 // CHECK4-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[I]], align 4
@@ -2096,7 +3243,6 @@
 // CHECK4-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK4-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], 1
 // CHECK4-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK4:       omp.inner.for.end:
@@ -2104,9 +3250,7 @@
 // CHECK4:       omp.dispatch.inc:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK4:       omp.dispatch.end:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP0]])
-// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
+// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //
@@ -2124,7 +3268,7 @@
 // CHECK4-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I:%.*]] = alloca i64, align 8
-// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -2133,11 +3277,9 @@
 // CHECK4-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
 // CHECK4-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 67, i64 0, i64 16908287, i64 1, i64 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK4-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -2156,7 +3298,6 @@
 // CHECK4-NEXT:    [[MUL:%.*]] = mul i64 [[TMP5]], 127
 // CHECK4-NEXT:    [[ADD1:%.*]] = add i64 131071, [[MUL]]
 // CHECK4-NEXT:    store i64 [[ADD1]], i64* [[I]], align 8
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    [[TMP7:%.*]] = load i64, i64* [[I]], align 8
@@ -2184,7 +3325,6 @@
 // CHECK4-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK4-NEXT:    [[ADD7:%.*]] = add i64 [[TMP17]], 1
 // CHECK4-NEXT:    store i64 [[ADD7]], i64* [[DOTOMP_IV]], align 8
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK4:       omp.inner.for.end:
@@ -2192,9 +3332,7 @@
 // CHECK4:       omp.dispatch.inc:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK4:       omp.dispatch.end:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //
@@ -2220,7 +3358,7 @@
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I8:%.*]] = alloca i8, align 1
 // CHECK4-NEXT:    [[X9:%.*]] = alloca i32, align 4
-// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -2253,11 +3391,9 @@
 // CHECK4-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK4-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 70, i64 0, i64 [[TMP6]], i64 1, i64 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK4-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -2288,7 +3424,6 @@
 // CHECK4-NEXT:    [[SUB20:%.*]] = sub nsw i64 11, [[MUL19]]
 // CHECK4-NEXT:    [[CONV21:%.*]] = trunc i64 [[SUB20]] to i32
 // CHECK4-NEXT:    store i32 [[CONV21]], i32* [[X9]], align 4
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    [[TMP15:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    [[TMP16:%.*]] = load i8, i8* [[I8]], align 1
@@ -2320,7 +3455,6 @@
 // CHECK4-NEXT:    [[TMP26:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK4-NEXT:    [[ADD30:%.*]] = add nsw i64 [[TMP26]], 1
 // CHECK4-NEXT:    store i64 [[ADD30]], i64* [[DOTOMP_IV]], align 8
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK4:       omp.inner.for.end:
@@ -2330,9 +3464,7 @@
 // CHECK4:       omp.dispatch.end:
 // CHECK4-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK4:       omp.precond.end:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //
@@ -2353,7 +3485,7 @@
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I:%.*]] = alloca i8, align 1
 // CHECK4-NEXT:    [[X2:%.*]] = alloca i32, align 4
-// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -2363,11 +3495,9 @@
 // CHECK4-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
 // CHECK4-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 69, i32 0, i32 199, i32 1, i32 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK4-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -2395,7 +3525,6 @@
 // CHECK4-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
 // CHECK4-NEXT:    [[ADD6:%.*]] = add nsw i32 -10, [[MUL5]]
 // CHECK4-NEXT:    store i32 [[ADD6]], i32* [[X2]], align 4
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    [[TMP9:%.*]] = load i8, i8* [[I]], align 1
@@ -2427,7 +3556,6 @@
 // CHECK4-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK4-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP19]], 1
 // CHECK4-NEXT:    store i32 [[ADD15]], i32* [[DOTOMP_IV]], align 4
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK4:       omp.inner.for.end:
@@ -2435,9 +3563,7 @@
 // CHECK4:       omp.dispatch.inc:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK4:       omp.dispatch.end:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //
@@ -2464,7 +3590,7 @@
 // CHECK4-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I28:%.*]] = alloca i32, align 4
-// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
 // CHECK4-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
 // CHECK4-NEXT:    [[TMP1:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
@@ -2551,11 +3677,9 @@
 // CHECK4-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK4-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 [[TMP25]], i32 1, i32 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK4-NEXT:    [[TMP26:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP26]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -2563,15 +3687,13 @@
 // CHECK4-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
 // CHECK4-NEXT:    store i32 [[TMP27]], i32* [[DOTOMP_IV16]], align 4
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND29:%.*]]
-// CHECK4-IRBUILDER:       omp.inner.for.cond30:
-// CHECK4-NORMAL:       omp.inner.for.cond29:
+// CHECK4:       omp.inner.for.cond29:
 // CHECK4-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[ADD30:%.*]] = add i32 [[TMP29]], 1
 // CHECK4-NEXT:    [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]]
 // CHECK4-NEXT:    br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]]
-// CHECK4-IRBUILDER:       omp.inner.for.body33:
-// CHECK4-NORMAL:       omp.inner.for.body32:
+// CHECK4:       omp.inner.for.body32:
 // CHECK4-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[MUL33:%.*]] = mul i32 [[TMP31]], 1
@@ -2583,19 +3705,15 @@
 // CHECK4-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX36]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
 // CHECK4-NEXT:    br label [[OMP_BODY_CONTINUE37:%.*]]
-// CHECK4-IRBUILDER:       omp.body.continue38:
-// CHECK4-NORMAL:       omp.body.continue37:
+// CHECK4:       omp.body.continue37:
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_INC38:%.*]]
-// CHECK4-IRBUILDER:       omp.inner.for.inc39:
-// CHECK4-NORMAL:       omp.inner.for.inc38:
+// CHECK4:       omp.inner.for.inc38:
 // CHECK4-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[ADD39:%.*]] = add i32 [[TMP33]], 1
 // CHECK4-NEXT:    store i32 [[ADD39]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group !7
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]]
-// CHECK4-IRBUILDER:       omp.inner.for.end42:
-// CHECK4-NORMAL:       omp.inner.for.end40:
+// CHECK4:       omp.inner.for.end40:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK4:       omp.dispatch.inc:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND]]
@@ -2618,9 +3736,7 @@
 // CHECK4:       .omp.final.done:
 // CHECK4-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK4:       omp.precond.end:
-// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
-// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //
@@ -2650,6 +3766,1262 @@
 // CHECK4-NEXT:    ret void
 //
 //
+// CHECK3-IRBUILDER-LABEL: define {{[^@]+}}@_Z18static_not_chunkedPfS_S_S_
+// CHECK3-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-IRBUILDER-NEXT:  entry:
+// CHECK3-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK3-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK3-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]]
+// CHECK3-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 7
+// CHECK3-IRBUILDER-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
+// CHECK3-IRBUILDER-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP5:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 [[IDXPROM]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP8:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM3:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[IDXPROM3]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP11:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM6:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM6]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX7]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP13]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM9:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM9]]
+// CHECK3-IRBUILDER-NEXT:    store float [[MUL8]], float* [[ARRAYIDX10]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3-IRBUILDER:       omp.body.continue:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.inc:
+// CHECK3-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK3-IRBUILDER-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]])
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK3-IRBUILDER:       omp.inner.for.end:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.inc:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK3-IRBUILDER:       omp.dispatch.end:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]])
+// CHECK3-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK3-IRBUILDER-LABEL: define {{[^@]+}}@_Z8dynamic1PfS_S_S_
+// CHECK3-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK3-IRBUILDER-NEXT:  entry:
+// CHECK3-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[I:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 67, i64 0, i64 16908287, i64 1, i64 1)
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
+// CHECK3-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK3-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i64 [[TMP1]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[ADD:%.*]] = add i64 [[TMP3]], 1
+// CHECK3-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp ult i64 [[TMP2]], [[ADD]]
+// CHECK3-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[MUL:%.*]] = mul i64 [[TMP4]], 127
+// CHECK3-IRBUILDER-NEXT:    [[ADD2:%.*]] = add i64 131071, [[MUL]]
+// CHECK3-IRBUILDER-NEXT:    store i64 [[ADD2]], i64* [[I]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP5:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i64, i64* [[I]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 [[TMP6]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP8:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i64, i64* [[I]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[TMP9]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP11:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i64, i64* [[I]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[TMP12]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i64, i64* [[I]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP15]]
+// CHECK3-IRBUILDER-NEXT:    store float [[MUL7]], float* [[ARRAYIDX8]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3-IRBUILDER:       omp.body.continue:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.inc:
+// CHECK3-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[ADD9:%.*]] = add i64 [[TMP16]], 1
+// CHECK3-IRBUILDER-NEXT:    store i64 [[ADD9]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]])
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK3-IRBUILDER:       omp.inner.for.end:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.inc:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK3-IRBUILDER:       omp.dispatch.end:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM11]])
+// CHECK3-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK3-IRBUILDER-LABEL: define {{[^@]+}}@_Z9test_autoPfS_S_S_
+// CHECK3-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK3-IRBUILDER-NEXT:  entry:
+// CHECK3-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[Y:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK3-IRBUILDER-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1
+// CHECK3-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[I:%.*]] = alloca i8, align 1
+// CHECK3-IRBUILDER-NEXT:    [[X6:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[I8:%.*]] = alloca i8, align 1
+// CHECK3-IRBUILDER-NEXT:    [[X9:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[X]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[Y]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32, i32* [[Y]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[CONV:%.*]] = trunc i32 [[TMP0]] to i8
+// CHECK3-IRBUILDER-NEXT:    store i8 [[CONV]], i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP1]] to i32
+// CHECK3-IRBUILDER-NEXT:    [[SUB:%.*]] = sub i32 57, [[CONV3]]
+// CHECK3-IRBUILDER-NEXT:    [[ADD:%.*]] = add i32 [[SUB]], 1
+// CHECK3-IRBUILDER-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
+// CHECK3-IRBUILDER-NEXT:    [[CONV4:%.*]] = zext i32 [[DIV]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV4]], 11
+// CHECK3-IRBUILDER-NEXT:    [[SUB5:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK3-IRBUILDER-NEXT:    store i64 [[SUB5]], i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK3-IRBUILDER-NEXT:    store i8 [[TMP2]], i8* [[I]], align 1
+// CHECK3-IRBUILDER-NEXT:    store i32 11, i32* [[X6]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[CONV7:%.*]] = sext i8 [[TMP3]] to i32
+// CHECK3-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[CONV7]], 57
+// CHECK3-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3-IRBUILDER:       omp.precond.then:
+// CHECK3-IRBUILDER-NEXT:    store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i64 [[TMP4]], i64* [[DOTOMP_UB]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 70, i64 0, i64 [[TMP5]], i64 1, i64 1)
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
+// CHECK3-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK3-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i64 [[TMP7]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[CMP11:%.*]] = icmp sle i64 [[TMP8]], [[TMP9]]
+// CHECK3-IRBUILDER-NEXT:    br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP10:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[CONV12:%.*]] = sext i8 [[TMP10]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[DIV13:%.*]] = sdiv i64 [[TMP11]], 11
+// CHECK3-IRBUILDER-NEXT:    [[MUL14:%.*]] = mul nsw i64 [[DIV13]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD15:%.*]] = add nsw i64 [[CONV12]], [[MUL14]]
+// CHECK3-IRBUILDER-NEXT:    [[CONV16:%.*]] = trunc i64 [[ADD15]] to i8
+// CHECK3-IRBUILDER-NEXT:    store i8 [[CONV16]], i8* [[I8]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[DIV17:%.*]] = sdiv i64 [[TMP13]], 11
+// CHECK3-IRBUILDER-NEXT:    [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 11
+// CHECK3-IRBUILDER-NEXT:    [[SUB19:%.*]] = sub nsw i64 [[TMP12]], [[MUL18]]
+// CHECK3-IRBUILDER-NEXT:    [[MUL20:%.*]] = mul nsw i64 [[SUB19]], 1
+// CHECK3-IRBUILDER-NEXT:    [[SUB21:%.*]] = sub nsw i64 11, [[MUL20]]
+// CHECK3-IRBUILDER-NEXT:    [[CONV22:%.*]] = trunc i64 [[SUB21]] to i32
+// CHECK3-IRBUILDER-NEXT:    store i32 [[CONV22]], i32* [[X9]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM23:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i8 [[TMP15]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP17:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM24:%.*]] = sext i8 [[TMP18]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds float, float* [[TMP17]], i64 [[IDXPROM24]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX25]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[MUL26:%.*]] = fmul float [[TMP16]], [[TMP19]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP20:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP21:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM27:%.*]] = sext i8 [[TMP21]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[IDXPROM27]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX28]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[MUL29:%.*]] = fmul float [[MUL26]], [[TMP22]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP23:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP24:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM30:%.*]] = sext i8 [[TMP24]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds float, float* [[TMP23]], i64 [[IDXPROM30]]
+// CHECK3-IRBUILDER-NEXT:    store float [[MUL29]], float* [[ARRAYIDX31]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]])
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3-IRBUILDER:       omp.body.continue:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.inc:
+// CHECK3-IRBUILDER-NEXT:    [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[ADD32:%.*]] = add nsw i64 [[TMP25]], 1
+// CHECK3-IRBUILDER-NEXT:    store i64 [[ADD32]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM33]])
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK3-IRBUILDER:       omp.inner.for.end:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.inc:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK3-IRBUILDER:       omp.dispatch.end:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK3-IRBUILDER:       omp.precond.end:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM34:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM34]])
+// CHECK3-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK3-IRBUILDER-LABEL: define {{[^@]+}}@_Z7runtimePfS_S_S_
+// CHECK3-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK3-IRBUILDER-NEXT:  entry:
+// CHECK3-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK3-IRBUILDER-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK3-IRBUILDER-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[I:%.*]] = alloca i8, align 1
+// CHECK3-IRBUILDER-NEXT:    [[X2:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[X]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 69, i32 0, i32 199, i32 1, i32 1)
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK3-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK3-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]]
+// CHECK3-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP4]], 20
+// CHECK3-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD:%.*]] = add nsw i32 48, [[MUL]]
+// CHECK3-IRBUILDER-NEXT:    [[CONV:%.*]] = trunc i32 [[ADD]] to i8
+// CHECK3-IRBUILDER-NEXT:    store i8 [[CONV]], i8* [[I]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP6]], 20
+// CHECK3-IRBUILDER-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 20
+// CHECK3-IRBUILDER-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP5]], [[MUL5]]
+// CHECK3-IRBUILDER-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD7:%.*]] = add nsw i32 -10, [[MUL6]]
+// CHECK3-IRBUILDER-NEXT:    store i32 [[ADD7]], i32* [[X2]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i8, i8* [[I]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP7]], i64 [[IDXPROM]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i8, i8* [[I]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[TMP10]], i64 [[IDXPROM9]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX10]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP14:%.*]] = load i8, i8* [[I]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 [[IDXPROM12]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]]
+// CHECK3-IRBUILDER-NEXT:    [[TMP16:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP17:%.*]] = load i8, i8* [[I]], align 1
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[TMP16]], i64 [[IDXPROM15]]
+// CHECK3-IRBUILDER-NEXT:    store float [[MUL14]], float* [[ARRAYIDX16]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]])
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3-IRBUILDER:       omp.body.continue:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.inc:
+// CHECK3-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK3-IRBUILDER-NEXT:    store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM18:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM18]])
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK3-IRBUILDER:       omp.inner.for.end:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.inc:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK3-IRBUILDER:       omp.dispatch.end:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM19:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM19]])
+// CHECK3-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK3-IRBUILDER-LABEL: define {{[^@]+}}@_Z8foo_simdii
+// CHECK3-IRBUILDER-SAME: (i32 noundef [[LOW:%.*]], i32 noundef [[UP:%.*]]) #[[ATTR0]] {
+// CHECK3-IRBUILDER-NEXT:  entry:
+// CHECK3-IRBUILDER-NEXT:    [[LOW_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[UP_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[I5:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IV16:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[_TMP17:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_18:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_20:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[I26:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    [[I28:%.*]] = alloca i32, align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[UP_ADDR]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[SUB:%.*]] = sub i32 [[TMP2]], [[TMP3]]
+// CHECK3-IRBUILDER-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 1
+// CHECK3-IRBUILDER-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
+// CHECK3-IRBUILDER-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK3-IRBUILDER-NEXT:    store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP4]], i32* [[I]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP5]], [[TMP6]]
+// CHECK3-IRBUILDER-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK3-IRBUILDER:       simd.if.then:
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    [[ADD6:%.*]] = add i32 [[TMP8]], 1
+// CHECK3-IRBUILDER-NEXT:    [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]]
+// CHECK3-IRBUILDER-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    [[MUL:%.*]] = mul i32 [[TMP10]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]]
+// CHECK3-IRBUILDER-NEXT:    store i32 [[ADD8]], i32* [[I5]], align 4, !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i32, i32* [[I5]], align 4, !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK3-IRBUILDER-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX]], align 4, !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    call void @__captured_stmt(i32* [[I5]]), !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3-IRBUILDER:       omp.body.continue:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.inc:
+// CHECK3-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    [[ADD9:%.*]] = add i32 [[TMP12]], 1
+// CHECK3-IRBUILDER-NEXT:    store i32 [[ADD9]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// CHECK3-IRBUILDER:       omp.inner.for.end:
+// CHECK3-IRBUILDER-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[SUB10:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+// CHECK3-IRBUILDER-NEXT:    [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD12:%.*]] = add i32 [[SUB11]], 1
+// CHECK3-IRBUILDER-NEXT:    [[DIV13:%.*]] = udiv i32 [[ADD12]], 1
+// CHECK3-IRBUILDER-NEXT:    [[MUL14:%.*]] = mul i32 [[DIV13]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD15:%.*]] = add i32 [[TMP13]], [[MUL14]]
+// CHECK3-IRBUILDER-NEXT:    store i32 [[ADD15]], i32* [[I5]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[SIMD_IF_END]]
+// CHECK3-IRBUILDER:       simd.if.end:
+// CHECK3-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP16]], i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP17:%.*]] = load i32, i32* [[UP_ADDR]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP17]], i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[SUB21:%.*]] = sub i32 [[TMP18]], [[TMP19]]
+// CHECK3-IRBUILDER-NEXT:    [[SUB22:%.*]] = sub i32 [[SUB21]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD23:%.*]] = add i32 [[SUB22]], 1
+// CHECK3-IRBUILDER-NEXT:    [[DIV24:%.*]] = udiv i32 [[ADD23]], 1
+// CHECK3-IRBUILDER-NEXT:    [[SUB25:%.*]] = sub i32 [[DIV24]], 1
+// CHECK3-IRBUILDER-NEXT:    store i32 [[SUB25]], i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP20]], i32* [[I26]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[CMP27:%.*]] = icmp slt i32 [[TMP21]], [[TMP22]]
+// CHECK3-IRBUILDER-NEXT:    br i1 [[CMP27]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3-IRBUILDER:       omp.precond.then:
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 66, i32 0, i32 [[TMP24]], i32 1, i32 1)
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM29:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM29]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK3-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK3-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.body:
+// CHECK3-IRBUILDER-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-IRBUILDER-NEXT:    store i32 [[TMP26]], i32* [[DOTOMP_IV16]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND30:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.cond30:
+// CHECK3-IRBUILDER-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    [[ADD31:%.*]] = add i32 [[TMP28]], 1
+// CHECK3-IRBUILDER-NEXT:    [[CMP32:%.*]] = icmp ult i32 [[TMP27]], [[ADD31]]
+// CHECK3-IRBUILDER-NEXT:    br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END42:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body33:
+// CHECK3-IRBUILDER-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    [[MUL34:%.*]] = mul i32 [[TMP30]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD35:%.*]] = add i32 [[TMP29]], [[MUL34]]
+// CHECK3-IRBUILDER-NEXT:    store i32 [[ADD35]], i32* [[I28]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    [[TMP31:%.*]] = load i32, i32* [[I28]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM36:%.*]] = sext i32 [[TMP31]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM36]]
+// CHECK3-IRBUILDER-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX37]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.body33.ordered.after:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE38:%.*]]
+// CHECK3-IRBUILDER:       omp.body.continue38:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC39:%.*]]
+// CHECK3-IRBUILDER:       omp.inner.for.inc39:
+// CHECK3-IRBUILDER-NEXT:    [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    [[ADD40:%.*]] = add i32 [[TMP32]], 1
+// CHECK3-IRBUILDER-NEXT:    store i32 [[ADD40]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]), !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK3-IRBUILDER:       omp.inner.for.end42:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK3-IRBUILDER:       omp.dispatch.inc:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK3-IRBUILDER:       omp.dispatch.end:
+// CHECK3-IRBUILDER-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
+// CHECK3-IRBUILDER-NEXT:    br i1 [[TMP34]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3-IRBUILDER:       .omp.final.then:
+// CHECK3-IRBUILDER-NEXT:    [[TMP35:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[SUB43:%.*]] = sub i32 [[TMP36]], [[TMP37]]
+// CHECK3-IRBUILDER-NEXT:    [[SUB44:%.*]] = sub i32 [[SUB43]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD45:%.*]] = add i32 [[SUB44]], 1
+// CHECK3-IRBUILDER-NEXT:    [[DIV46:%.*]] = udiv i32 [[ADD45]], 1
+// CHECK3-IRBUILDER-NEXT:    [[MUL47:%.*]] = mul i32 [[DIV46]], 1
+// CHECK3-IRBUILDER-NEXT:    [[ADD48:%.*]] = add i32 [[TMP35]], [[MUL47]]
+// CHECK3-IRBUILDER-NEXT:    store i32 [[ADD48]], i32* [[I28]], align 4
+// CHECK3-IRBUILDER-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK3-IRBUILDER:       .omp.final.done:
+// CHECK3-IRBUILDER-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK3-IRBUILDER:       omp.precond.end:
+// CHECK3-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM49:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM49]])
+// CHECK3-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK3-IRBUILDER-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK3-IRBUILDER-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK3-IRBUILDER-NEXT:  entry:
+// CHECK3-IRBUILDER-NEXT:    [[I_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-IRBUILDER-NEXT:    store i32* [[I]], i32** [[I_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK3-IRBUILDER-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX]], align 4
+// CHECK3-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK3-IRBUILDER-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK3-IRBUILDER-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR3]] {
+// CHECK3-IRBUILDER-NEXT:  entry:
+// CHECK3-IRBUILDER-NEXT:    [[I_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-IRBUILDER-NEXT:    store i32* [[I]], i32** [[I_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK3-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK3-IRBUILDER-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX]], align 4
+// CHECK3-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK4-IRBUILDER-LABEL: define {{[^@]+}}@_Z18static_not_chunkedPfS_S_S_
+// CHECK4-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK4-IRBUILDER-NEXT:  entry:
+// CHECK4-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK4-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK4-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]]
+// CHECK4-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 7
+// CHECK4-IRBUILDER-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
+// CHECK4-IRBUILDER-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP5:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 [[IDXPROM]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP8:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM3:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[IDXPROM3]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP11:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM6:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM6]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX7]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP13]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM9:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM9]]
+// CHECK4-IRBUILDER-NEXT:    store float [[MUL8]], float* [[ARRAYIDX10]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4-IRBUILDER:       omp.body.continue:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.inc:
+// CHECK4-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK4-IRBUILDER-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]])
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK4-IRBUILDER:       omp.inner.for.end:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.inc:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK4-IRBUILDER:       omp.dispatch.end:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]])
+// CHECK4-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK4-IRBUILDER-LABEL: define {{[^@]+}}@_Z8dynamic1PfS_S_S_
+// CHECK4-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK4-IRBUILDER-NEXT:  entry:
+// CHECK4-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[I:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 67, i64 0, i64 16908287, i64 1, i64 1)
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
+// CHECK4-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK4-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i64 [[TMP1]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[ADD:%.*]] = add i64 [[TMP3]], 1
+// CHECK4-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp ult i64 [[TMP2]], [[ADD]]
+// CHECK4-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[MUL:%.*]] = mul i64 [[TMP4]], 127
+// CHECK4-IRBUILDER-NEXT:    [[ADD2:%.*]] = add i64 131071, [[MUL]]
+// CHECK4-IRBUILDER-NEXT:    store i64 [[ADD2]], i64* [[I]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP5:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i64, i64* [[I]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 [[TMP6]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP8:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i64, i64* [[I]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[TMP9]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP11:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i64, i64* [[I]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[TMP12]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i64, i64* [[I]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP15]]
+// CHECK4-IRBUILDER-NEXT:    store float [[MUL7]], float* [[ARRAYIDX8]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4-IRBUILDER:       omp.body.continue:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.inc:
+// CHECK4-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[ADD9:%.*]] = add i64 [[TMP16]], 1
+// CHECK4-IRBUILDER-NEXT:    store i64 [[ADD9]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]])
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK4-IRBUILDER:       omp.inner.for.end:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.inc:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK4-IRBUILDER:       omp.dispatch.end:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM11]])
+// CHECK4-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK4-IRBUILDER-LABEL: define {{[^@]+}}@_Z9test_autoPfS_S_S_
+// CHECK4-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK4-IRBUILDER-NEXT:  entry:
+// CHECK4-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[Y:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK4-IRBUILDER-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1
+// CHECK4-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[I:%.*]] = alloca i8, align 1
+// CHECK4-IRBUILDER-NEXT:    [[X6:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[I8:%.*]] = alloca i8, align 1
+// CHECK4-IRBUILDER-NEXT:    [[X9:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[X]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[Y]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32, i32* [[Y]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[CONV:%.*]] = trunc i32 [[TMP0]] to i8
+// CHECK4-IRBUILDER-NEXT:    store i8 [[CONV]], i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP1]] to i32
+// CHECK4-IRBUILDER-NEXT:    [[SUB:%.*]] = sub i32 57, [[CONV3]]
+// CHECK4-IRBUILDER-NEXT:    [[ADD:%.*]] = add i32 [[SUB]], 1
+// CHECK4-IRBUILDER-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
+// CHECK4-IRBUILDER-NEXT:    [[CONV4:%.*]] = zext i32 [[DIV]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV4]], 11
+// CHECK4-IRBUILDER-NEXT:    [[SUB5:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK4-IRBUILDER-NEXT:    store i64 [[SUB5]], i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK4-IRBUILDER-NEXT:    store i8 [[TMP2]], i8* [[I]], align 1
+// CHECK4-IRBUILDER-NEXT:    store i32 11, i32* [[X6]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[CONV7:%.*]] = sext i8 [[TMP3]] to i32
+// CHECK4-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[CONV7]], 57
+// CHECK4-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4-IRBUILDER:       omp.precond.then:
+// CHECK4-IRBUILDER-NEXT:    store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i64 [[TMP4]], i64* [[DOTOMP_UB]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 70, i64 0, i64 [[TMP5]], i64 1, i64 1)
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM10:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM10]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
+// CHECK4-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK4-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i64 [[TMP7]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[CMP11:%.*]] = icmp sle i64 [[TMP8]], [[TMP9]]
+// CHECK4-IRBUILDER-NEXT:    br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP10:%.*]] = load i8, i8* [[DOTCAPTURE_EXPR_]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[CONV12:%.*]] = sext i8 [[TMP10]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[DIV13:%.*]] = sdiv i64 [[TMP11]], 11
+// CHECK4-IRBUILDER-NEXT:    [[MUL14:%.*]] = mul nsw i64 [[DIV13]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD15:%.*]] = add nsw i64 [[CONV12]], [[MUL14]]
+// CHECK4-IRBUILDER-NEXT:    [[CONV16:%.*]] = trunc i64 [[ADD15]] to i8
+// CHECK4-IRBUILDER-NEXT:    store i8 [[CONV16]], i8* [[I8]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[DIV17:%.*]] = sdiv i64 [[TMP13]], 11
+// CHECK4-IRBUILDER-NEXT:    [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 11
+// CHECK4-IRBUILDER-NEXT:    [[SUB19:%.*]] = sub nsw i64 [[TMP12]], [[MUL18]]
+// CHECK4-IRBUILDER-NEXT:    [[MUL20:%.*]] = mul nsw i64 [[SUB19]], 1
+// CHECK4-IRBUILDER-NEXT:    [[SUB21:%.*]] = sub nsw i64 11, [[MUL20]]
+// CHECK4-IRBUILDER-NEXT:    [[CONV22:%.*]] = trunc i64 [[SUB21]] to i32
+// CHECK4-IRBUILDER-NEXT:    store i32 [[CONV22]], i32* [[X9]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM23:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP14:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i8 [[TMP15]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP17:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM24:%.*]] = sext i8 [[TMP18]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds float, float* [[TMP17]], i64 [[IDXPROM24]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX25]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[MUL26:%.*]] = fmul float [[TMP16]], [[TMP19]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP20:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP21:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM27:%.*]] = sext i8 [[TMP21]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[IDXPROM27]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX28]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[MUL29:%.*]] = fmul float [[MUL26]], [[TMP22]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP23:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP24:%.*]] = load i8, i8* [[I8]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM30:%.*]] = sext i8 [[TMP24]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds float, float* [[TMP23]], i64 [[IDXPROM30]]
+// CHECK4-IRBUILDER-NEXT:    store float [[MUL29]], float* [[ARRAYIDX31]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]])
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4-IRBUILDER:       omp.body.continue:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.inc:
+// CHECK4-IRBUILDER-NEXT:    [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[ADD32:%.*]] = add nsw i64 [[TMP25]], 1
+// CHECK4-IRBUILDER-NEXT:    store i64 [[ADD32]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM33]])
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK4-IRBUILDER:       omp.inner.for.end:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.inc:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK4-IRBUILDER:       omp.dispatch.end:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK4-IRBUILDER:       omp.precond.end:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM34:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM34]])
+// CHECK4-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK4-IRBUILDER-LABEL: define {{[^@]+}}@_Z7runtimePfS_S_S_
+// CHECK4-IRBUILDER-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK4-IRBUILDER-NEXT:  entry:
+// CHECK4-IRBUILDER-NEXT:    [[A_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[B_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[C_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK4-IRBUILDER-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK4-IRBUILDER-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[I:%.*]] = alloca i8, align 1
+// CHECK4-IRBUILDER-NEXT:    [[X2:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[X]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 69, i32 0, i32 199, i32 1, i32 1)
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK4-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK4-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]]
+// CHECK4-IRBUILDER-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP4]], 20
+// CHECK4-IRBUILDER-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD:%.*]] = add nsw i32 48, [[MUL]]
+// CHECK4-IRBUILDER-NEXT:    [[CONV:%.*]] = trunc i32 [[ADD]] to i8
+// CHECK4-IRBUILDER-NEXT:    store i8 [[CONV]], i8* [[I]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP6]], 20
+// CHECK4-IRBUILDER-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 20
+// CHECK4-IRBUILDER-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP5]], [[MUL5]]
+// CHECK4-IRBUILDER-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD7:%.*]] = add nsw i32 -10, [[MUL6]]
+// CHECK4-IRBUILDER-NEXT:    store i32 [[ADD7]], i32* [[X2]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP7:%.*]] = load float*, float** [[B_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i8, i8* [[I]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP7]], i64 [[IDXPROM]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP10:%.*]] = load float*, float** [[C_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i8, i8* [[I]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[TMP10]], i64 [[IDXPROM9]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX10]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP13:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP14:%.*]] = load i8, i8* [[I]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 [[IDXPROM12]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]]
+// CHECK4-IRBUILDER-NEXT:    [[TMP16:%.*]] = load float*, float** [[A_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP17:%.*]] = load i8, i8* [[I]], align 1
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[TMP16]], i64 [[IDXPROM15]]
+// CHECK4-IRBUILDER-NEXT:    store float [[MUL14]], float* [[ARRAYIDX16]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]])
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4-IRBUILDER:       omp.body.continue:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.inc:
+// CHECK4-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK4-IRBUILDER-NEXT:    store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM18:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM18]])
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK4-IRBUILDER:       omp.inner.for.end:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.inc:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK4-IRBUILDER:       omp.dispatch.end:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM19:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM19]])
+// CHECK4-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK4-IRBUILDER-LABEL: define {{[^@]+}}@_Z8foo_simdii
+// CHECK4-IRBUILDER-SAME: (i32 noundef [[LOW:%.*]], i32 noundef [[UP:%.*]]) #[[ATTR0]] {
+// CHECK4-IRBUILDER-NEXT:  entry:
+// CHECK4-IRBUILDER-NEXT:    [[LOW_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[UP_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[I5:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IV16:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[_TMP17:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_18:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTCAPTURE_EXPR_20:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[I26:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    [[I28:%.*]] = alloca i32, align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[UP_ADDR]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[SUB:%.*]] = sub i32 [[TMP2]], [[TMP3]]
+// CHECK4-IRBUILDER-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 1
+// CHECK4-IRBUILDER-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
+// CHECK4-IRBUILDER-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK4-IRBUILDER-NEXT:    store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP4]], i32* [[I]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP5]], [[TMP6]]
+// CHECK4-IRBUILDER-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK4-IRBUILDER:       simd.if.then:
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    [[ADD6:%.*]] = add i32 [[TMP8]], 1
+// CHECK4-IRBUILDER-NEXT:    [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]]
+// CHECK4-IRBUILDER-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    [[MUL:%.*]] = mul i32 [[TMP10]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]]
+// CHECK4-IRBUILDER-NEXT:    store i32 [[ADD8]], i32* [[I5]], align 4, !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    [[TMP11:%.*]] = load i32, i32* [[I5]], align 4, !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK4-IRBUILDER-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX]], align 4, !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    call void @__captured_stmt(i32* [[I5]]), !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body.ordered.after:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4-IRBUILDER:       omp.body.continue:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.inc:
+// CHECK4-IRBUILDER-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    [[ADD9:%.*]] = add i32 [[TMP12]], 1
+// CHECK4-IRBUILDER-NEXT:    store i32 [[ADD9]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group !3
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// CHECK4-IRBUILDER:       omp.inner.for.end:
+// CHECK4-IRBUILDER-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[SUB10:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+// CHECK4-IRBUILDER-NEXT:    [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD12:%.*]] = add i32 [[SUB11]], 1
+// CHECK4-IRBUILDER-NEXT:    [[DIV13:%.*]] = udiv i32 [[ADD12]], 1
+// CHECK4-IRBUILDER-NEXT:    [[MUL14:%.*]] = mul i32 [[DIV13]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD15:%.*]] = add i32 [[TMP13]], [[MUL14]]
+// CHECK4-IRBUILDER-NEXT:    store i32 [[ADD15]], i32* [[I5]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[SIMD_IF_END]]
+// CHECK4-IRBUILDER:       simd.if.end:
+// CHECK4-IRBUILDER-NEXT:    [[TMP16:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP16]], i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP17:%.*]] = load i32, i32* [[UP_ADDR]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP17]], i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[SUB21:%.*]] = sub i32 [[TMP18]], [[TMP19]]
+// CHECK4-IRBUILDER-NEXT:    [[SUB22:%.*]] = sub i32 [[SUB21]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD23:%.*]] = add i32 [[SUB22]], 1
+// CHECK4-IRBUILDER-NEXT:    [[DIV24:%.*]] = udiv i32 [[ADD23]], 1
+// CHECK4-IRBUILDER-NEXT:    [[SUB25:%.*]] = sub i32 [[DIV24]], 1
+// CHECK4-IRBUILDER-NEXT:    store i32 [[SUB25]], i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP20]], i32* [[I26]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[CMP27:%.*]] = icmp slt i32 [[TMP21]], [[TMP22]]
+// CHECK4-IRBUILDER-NEXT:    br i1 [[CMP27]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4-IRBUILDER:       omp.precond.then:
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 66, i32 0, i32 [[TMP24]], i32 1, i32 1)
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM29:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM29]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK4-IRBUILDER-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK4-IRBUILDER-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.body:
+// CHECK4-IRBUILDER-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-IRBUILDER-NEXT:    store i32 [[TMP26]], i32* [[DOTOMP_IV16]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND30:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.cond30:
+// CHECK4-IRBUILDER-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    [[ADD31:%.*]] = add i32 [[TMP28]], 1
+// CHECK4-IRBUILDER-NEXT:    [[CMP32:%.*]] = icmp ult i32 [[TMP27]], [[ADD31]]
+// CHECK4-IRBUILDER-NEXT:    br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END42:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body33:
+// CHECK4-IRBUILDER-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    [[MUL34:%.*]] = mul i32 [[TMP30]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD35:%.*]] = add i32 [[TMP29]], [[MUL34]]
+// CHECK4-IRBUILDER-NEXT:    store i32 [[ADD35]], i32* [[I28]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    [[TMP31:%.*]] = load i32, i32* [[I28]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM36:%.*]] = sext i32 [[TMP31]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM36]]
+// CHECK4-IRBUILDER-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX37]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.body33.ordered.after:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_BODY_CONTINUE38:%.*]]
+// CHECK4-IRBUILDER:       omp.body.continue38:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_INC39:%.*]]
+// CHECK4-IRBUILDER:       omp.inner.for.inc39:
+// CHECK4-IRBUILDER-NEXT:    [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    [[ADD40:%.*]] = add i32 [[TMP32]], 1
+// CHECK4-IRBUILDER-NEXT:    store i32 [[ADD40]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]), !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK4-IRBUILDER:       omp.inner.for.end42:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK4-IRBUILDER:       omp.dispatch.inc:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK4-IRBUILDER:       omp.dispatch.end:
+// CHECK4-IRBUILDER-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0
+// CHECK4-IRBUILDER-NEXT:    br i1 [[TMP34]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK4-IRBUILDER:       .omp.final.then:
+// CHECK4-IRBUILDER-NEXT:    [[TMP35:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[SUB43:%.*]] = sub i32 [[TMP36]], [[TMP37]]
+// CHECK4-IRBUILDER-NEXT:    [[SUB44:%.*]] = sub i32 [[SUB43]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD45:%.*]] = add i32 [[SUB44]], 1
+// CHECK4-IRBUILDER-NEXT:    [[DIV46:%.*]] = udiv i32 [[ADD45]], 1
+// CHECK4-IRBUILDER-NEXT:    [[MUL47:%.*]] = mul i32 [[DIV46]], 1
+// CHECK4-IRBUILDER-NEXT:    [[ADD48:%.*]] = add i32 [[TMP35]], [[MUL47]]
+// CHECK4-IRBUILDER-NEXT:    store i32 [[ADD48]], i32* [[I28]], align 4
+// CHECK4-IRBUILDER-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK4-IRBUILDER:       .omp.final.done:
+// CHECK4-IRBUILDER-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK4-IRBUILDER:       omp.precond.end:
+// CHECK4-IRBUILDER-NEXT:    [[OMP_GLOBAL_THREAD_NUM49:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[OMP_GLOBAL_THREAD_NUM49]])
+// CHECK4-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK4-IRBUILDER-LABEL: define {{[^@]+}}@__captured_stmt
+// CHECK4-IRBUILDER-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK4-IRBUILDER-NEXT:  entry:
+// CHECK4-IRBUILDER-NEXT:    [[I_ADDR:%.*]] = alloca i32*, align 8
+// CHECK4-IRBUILDER-NEXT:    store i32* [[I]], i32** [[I_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK4-IRBUILDER-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX]], align 4
+// CHECK4-IRBUILDER-NEXT:    ret void
+//
+//
+// CHECK4-IRBUILDER-LABEL: define {{[^@]+}}@__captured_stmt.1
+// CHECK4-IRBUILDER-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR3]] {
+// CHECK4-IRBUILDER-NEXT:  entry:
+// CHECK4-IRBUILDER-NEXT:    [[I_ADDR:%.*]] = alloca i32*, align 8
+// CHECK4-IRBUILDER-NEXT:    store i32* [[I]], i32** [[I_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK4-IRBUILDER-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* @f, i64 0, i64 [[IDXPROM]]
+// CHECK4-IRBUILDER-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX]], align 4
+// CHECK4-IRBUILDER-NEXT:    ret void
+//
+//
 // CHECK5-LABEL: define {{[^@]+}}@_Z18static_not_chunkedPfS_S_S_
 // CHECK5-SAME: (float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK5-NEXT:  entry:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -23,6 +23,52 @@
 namespace llvm {
 class CanonicalLoopInfo;
 
+/// Move the instructions after an InsertPoint to the beginning of another
+/// BasicBlock.
+///
+/// The instructions after \p IP are moved to the beginning of \p New which must
+/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
+/// \p New will be added such that there is no semantic change. Otherwise, the
+/// \p IP insert block remains degenerate and it is up to the caller to insert a
+/// terminator.
+void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
+              bool CreateBranch);
+
+/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
+/// insert location will stick to after the instruction before the insertion
+/// point (instead of moving with the instruction the InsertPoint stores
+/// internally).
+void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch);
+
+/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
+/// (missing the terminator).
+///
+/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
+/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
+/// is true, a branch to the new successor will be created such that
+/// semantically there is no change; otherwise the block of the insertion point
+/// remains degenerate and it is the caller's responsibility to insert a
+/// terminator. Returns the new successor block.
+BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
+                    llvm::Twine Name = {});
+
+/// Split a BasicBlock at \p Builder's insertion point, even if the block is
+/// degenerate (missing the terminator).  Its new insert location will stick to
+/// after the instruction before the insertion point (instead of moving with the
+/// instruction the InsertPoint stores internally).
+BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
+                    llvm::Twine Name = {});
+
+/// Split a BasicBlock at \p Builder's insertion point, even if the block is
+/// degenerate (missing the terminator).  Its new insert location will stick to
+/// after the instruction before the insertion point (instead of moving with the
+/// instruction the InsertPoint stores internally).
+BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name);
+
+/// Like splitBB, but reuses the current block's name for the new name.
+BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
+                              llvm::Twine Suffix = ".split");
+
 /// An interface to create LLVM-IR for OpenMP directives.
 ///
 /// Each OpenMP directive has a corresponding public generator method.
@@ -87,27 +133,36 @@
   /// Callback type for body (=inner region) code generation
   ///
   /// The callback takes code locations as arguments, each describing a
-  /// location at which code might need to be generated or a location that is
-  /// the target of control transfer.
+  /// location where additional instructions can be inserted.
+  ///
+  /// The CodeGenIP may be in the middle of a basic block or point to the end of
+  /// it. The basic block may have a terminator or be degenerate. The callback
+  /// function may just insert instructions at that position, but also split the
+  /// block (without the Before argument of BasicBlock::splitBasicBlock such
+  /// that the identity of the split predecessor block is preserved) and insert
+  /// additional control flow, including branches that do not lead back to what
+  /// follows the CodeGenIP. Note that since the callback is allowed to split
+  /// the block, callers must assume that InsertPoints to positions in the
+  /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
+  /// such InsertPoints need to be preserved, the caller can split the block
+  /// itself before calling the callback.
+  ///
+  /// AllocaIP and CodeGenIP must not point to the same position.
   ///
   /// \param AllocaIP is the insertion point at which new alloca instructions
-  ///                 should be placed.
+  ///                 should be placed. The BasicBlock it is pointing to must
+  ///                 not be split.
   /// \param CodeGenIP is the insertion point at which the body code should be
   ///                  placed.
-  /// \param ContinuationBB is the basic block target to leave the body.
-  ///
-  /// Note that all blocks pointed to by the arguments have terminators.
   using BodyGenCallbackTy =
-      function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                        BasicBlock &ContinuationBB)>;
+      function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
 
   // This is created primarily for sections construct as llvm::function_ref
   // (BodyGenCallbackTy) is not storable (as described in the comments of
   // function_ref class - function_ref contains non-ownable reference
   // to the callable.
   using StorableBodyGenCallbackTy =
-      std::function<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                         BasicBlock &ContinuationBB)>;
+      std::function<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
 
   /// Callback type for loop body code generation.
   ///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -252,16 +252,8 @@
   NewBr->setDebugLoc(DL);
 }
 
-/// Move the instruction after an InsertPoint to the beginning of another
-/// BasicBlock.
-///
-/// The instructions after \p IP are moved to the beginning of \p New which must
-/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
-/// \p New will be added such that there is no semantic change. Otherwise, the
-/// \p IP insert block remains degenerate and it is up to the caller to insert a
-/// terminator.
-static void spliceBB(OpenMPIRBuilder::InsertPointTy IP, BasicBlock *New,
-                     bool CreateBranch) {
+void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
+                    bool CreateBranch) {
   assert(New->getFirstInsertionPt() == New->begin() &&
          "Target BB must not have PHI nodes");
 
@@ -274,11 +266,7 @@
     BranchInst::Create(New, Old);
 }
 
-/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
-/// insert location will stick to after the instruction before the insertion
-/// point (instead of moving with the instruction the InsertPoint stores
-/// internally).
-static void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
+void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
   DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
   BasicBlock *Old = Builder.GetInsertBlock();
 
@@ -293,17 +281,8 @@
   Builder.SetCurrentDebugLocation(DebugLoc);
 }
 
-/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
-/// (missing the terminator).
-///
-/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
-/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
-/// is true, a branch to the new successor will new created such that
-/// semantically there is no change; otherwise the block of the insertion point
-/// remains degenerate and it is the caller's responsibility to insert a
-/// terminator. Returns the new successor block.
-static BasicBlock *splitBB(OpenMPIRBuilder::InsertPointTy IP, bool CreateBranch,
-                           llvm::Twine Name = {}) {
+BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
+                          llvm::Twine Name) {
   BasicBlock *Old = IP.getBlock();
   BasicBlock *New = BasicBlock::Create(
       Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
@@ -313,12 +292,22 @@
   return New;
 }
 
-/// Split a BasicBlock at \p Builder's insertion point, even if the block is
-/// degenerate (missing the terminator).  Its new insert location will stick to
-/// after the instruction before the insertion point (instead of moving with the
-/// instruction the InsertPoint stores internally).
-static BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch,
-                           llvm::Twine Name = {}) {
+BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
+                          llvm::Twine Name) {
+  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
+  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
+  if (CreateBranch)
+    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
+  else
+    Builder.SetInsertPoint(Builder.GetInsertBlock());
+  // SetInsertPoint also updates the Builder's debug location, but we want to
+  // keep the one the Builder was configured to use.
+  Builder.SetCurrentDebugLocation(DebugLoc);
+  return New;
+}
+
+BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
+                          llvm::Twine Name) {
   DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
   BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
   if (CreateBranch)
@@ -331,6 +320,12 @@
   return New;
 }
 
+BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
+                                    llvm::Twine Suffix) {
+  BasicBlock *Old = Builder.GetInsertBlock();
+  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
+}
+
 void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
   LLVMContext &Ctx = Fn.getContext();
 
@@ -928,7 +923,7 @@
   // Let the caller create the body.
   assert(BodyGenCB && "Expected body generation callback!");
   InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
-  BodyGenCB(InnerAllocaIP, CodeGenIP, *PRegPreFiniBB);
+  BodyGenCB(InnerAllocaIP, CodeGenIP);
 
   LLVM_DEBUG(dbgs() << "After  body codegen: " << *OuterFn << "\n");
 
@@ -1268,26 +1263,25 @@
   // section_loop.after:
   // <FiniCB>;
   auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
-    auto *CurFn = CodeGenIP.getBlock()->getParent();
-    auto *ForIncBB = CodeGenIP.getBlock()->getSingleSuccessor();
-    auto *ForExitBB = CodeGenIP.getBlock()
-                          ->getSinglePredecessor()
-                          ->getTerminator()
-                          ->getSuccessor(1);
-    SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, ForIncBB);
     Builder.restoreIP(CodeGenIP);
+    BasicBlock *Continue =
+        splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
+    Function *CurFn = Continue->getParent();
+    SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
+
     unsigned CaseNumber = 0;
     for (auto SectionCB : SectionCBs) {
-      auto *CaseBB = BasicBlock::Create(M.getContext(),
-                                        "omp_section_loop.body.case", CurFn);
+      BasicBlock *CaseBB = BasicBlock::Create(
+          M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
       SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
       Builder.SetInsertPoint(CaseBB);
-      SectionCB(InsertPointTy(), Builder.saveIP(), *ForExitBB);
+      BranchInst *CaseEndBr = Builder.CreateBr(Continue);
+      SectionCB(InsertPointTy(),
+                {CaseEndBr->getParent(), CaseEndBr->getIterator()});
       CaseNumber++;
     }
     // remove the existing terminator from body BB since there can be no
     // terminators after switch/case
-    CodeGenIP.getBlock()->getTerminator()->eraseFromParent();
   };
   // Loop body ends here
   // LowerBound, UpperBound, and STride for createCanonicalLoop
@@ -1297,29 +1291,22 @@
   Value *ST = ConstantInt::get(I32Ty, 1);
   llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
       Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
-  Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
-  AllocaIP = Builder.saveIP();
   InsertPointTy AfterIP =
       applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
-  BasicBlock *LoopAfterBB = AfterIP.getBlock();
-  Instruction *SplitPos = LoopAfterBB->getTerminator();
-  if (!isa_and_nonnull<BranchInst>(SplitPos))
-    SplitPos = new UnreachableInst(Builder.getContext(), LoopAfterBB);
-  // ExitBB after LoopAfterBB because LoopAfterBB is used for FinalizationCB,
-  // which requires a BB with branch
-  BasicBlock *ExitBB =
-      LoopAfterBB->splitBasicBlock(SplitPos, "omp_sections.end");
-  SplitPos->eraseFromParent();
 
   // Apply the finalization callback in LoopAfterBB
   auto FiniInfo = FinalizationStack.pop_back_val();
   assert(FiniInfo.DK == OMPD_sections &&
          "Unexpected finalization stack state!");
-  Builder.SetInsertPoint(LoopAfterBB->getTerminator());
-  FiniInfo.FiniCB(Builder.saveIP());
-  Builder.SetInsertPoint(ExitBB);
+  if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
+    Builder.restoreIP(AfterIP);
+    BasicBlock *FiniBB =
+        splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
+    CB(Builder.saveIP());
+    AfterIP = {FiniBB, FiniBB->begin()};
+  }
 
-  return Builder.saveIP();
+  return AfterIP;
 }
 
 OpenMPIRBuilder::InsertPointTy
@@ -3117,48 +3104,28 @@
 
   // generate body
   BodyGenCB(/* AllocaIP */ InsertPointTy(),
-            /* CodeGenIP */ Builder.saveIP(), *FiniBB);
-
-  // If we didn't emit a branch to FiniBB during body generation, it means
-  // FiniBB is unreachable (e.g. while(1);). stop generating all the
-  // unreachable blocks, and remove anything we are not going to use.
-  auto SkipEmittingRegion = FiniBB->hasNPredecessors(0);
-  if (SkipEmittingRegion) {
-    FiniBB->eraseFromParent();
-    ExitCall->eraseFromParent();
-    // Discard finalization if we have it.
-    if (HasFinalize) {
-      assert(!FinalizationStack.empty() &&
-             "Unexpected finalization stack state!");
-      FinalizationStack.pop_back();
-    }
-  } else {
-    // emit exit call and do any needed finalization.
-    auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
-    assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
-           FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
-           "Unexpected control flow graph state!!");
-    emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
-    assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
-           "Unexpected Control Flow State!");
-    MergeBlockIntoPredecessor(FiniBB);
-  }
+            /* CodeGenIP */ Builder.saveIP());
+
+  // emit exit call and do any needed finalization.
+  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
+  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
+         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
+         "Unexpected control flow graph state!!");
+  emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
+  assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
+         "Unexpected Control Flow State!");
+  MergeBlockIntoPredecessor(FiniBB);
 
   // If we are skipping the region of a non conditional, remove the exit
   // block, and clear the builder's insertion point.
   assert(SplitPos->getParent() == ExitBB &&
          "Unexpected Insertion point location!");
-  if (!Conditional && SkipEmittingRegion) {
-    ExitBB->eraseFromParent();
-    Builder.ClearInsertionPoint();
-  } else {
-    auto merged = MergeBlockIntoPredecessor(ExitBB);
-    BasicBlock *ExitPredBB = SplitPos->getParent();
-    auto InsertBB = merged ? ExitPredBB : ExitBB;
-    if (!isa_and_nonnull<BranchInst>(SplitPos))
-      SplitPos->eraseFromParent();
-    Builder.SetInsertPoint(InsertBB);
-  }
+  auto merged = MergeBlockIntoPredecessor(ExitBB);
+  BasicBlock *ExitPredBB = SplitPos->getParent();
+  auto InsertBB = merged ? ExitPredBB : ExitBB;
+  if (!isa_and_nonnull<BranchInst>(SplitPos))
+    SplitPos->eraseFromParent();
+  Builder.SetInsertPoint(InsertBB);
 
   return Builder.saveIP();
 }
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -935,8 +935,7 @@
     SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
 
     BasicBlock *StartBB = nullptr, *EndBB = nullptr;
-    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                         BasicBlock &ContinuationIP) {
+    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
       BasicBlock *CGStartBB = CodeGenIP.getBlock();
       BasicBlock *CGEndBB =
           SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
@@ -975,8 +974,7 @@
       const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
       ParentBB->getTerminator()->eraseFromParent();
 
-      auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                           BasicBlock &ContinuationIP) {
+      auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
         BasicBlock *CGStartBB = CodeGenIP.getBlock();
         BasicBlock *CGEndBB =
             SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -603,8 +603,7 @@
   unsigned NumPrivatizedVars = 0;
   unsigned NumFinalizationPoints = 0;
 
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &ContinuationIP) {
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     ++NumBodiesGenerated;
 
     Builder.restoreIP(AllocaIP);
@@ -618,10 +617,6 @@
     Instruction *ThenTerm, *ElseTerm;
     SplitBlockAndInsertIfThenElse(Cmp, CodeGenIP.getBlock()->getTerminator(),
                                   &ThenTerm, &ElseTerm);
-
-    Builder.SetInsertPoint(ThenTerm);
-    Builder.CreateBr(&ContinuationIP);
-    ThenTerm->eraseFromParent();
   };
 
   auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
@@ -711,8 +706,7 @@
   unsigned NumOuterBodiesGenerated = 0;
   unsigned NumFinalizationPoints = 0;
 
-  auto InnerBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                            BasicBlock &ContinuationIP) {
+  auto InnerBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     ++NumInnerBodiesGenerated;
   };
 
@@ -731,8 +725,7 @@
 
   auto FiniCB = [&](InsertPointTy CodeGenIP) { ++NumFinalizationPoints; };
 
-  auto OuterBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                            BasicBlock &ContinuationIP) {
+  auto OuterBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     ++NumOuterBodiesGenerated;
     Builder.restoreIP(CodeGenIP);
     BasicBlock *CGBB = CodeGenIP.getBlock();
@@ -807,8 +800,7 @@
   unsigned NumOuterBodiesGenerated = 0;
   unsigned NumFinalizationPoints = 0;
 
-  auto InnerBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                            BasicBlock &ContinuationIP) {
+  auto InnerBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     ++NumInnerBodiesGenerated;
   };
 
@@ -827,8 +819,7 @@
 
   auto FiniCB = [&](InsertPointTy CodeGenIP) { ++NumFinalizationPoints; };
 
-  auto OuterBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                            BasicBlock &ContinuationIP) {
+  auto OuterBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     ++NumOuterBodiesGenerated;
     Builder.restoreIP(CodeGenIP);
     BasicBlock *CGBB = CodeGenIP.getBlock();
@@ -920,8 +911,7 @@
   unsigned NumPrivatizedVars = 0;
   unsigned NumFinalizationPoints = 0;
 
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &ContinuationIP) {
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     ++NumBodiesGenerated;
 
     Builder.restoreIP(AllocaIP);
@@ -933,12 +923,8 @@
         Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use");
     Value *Cmp = Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
     Instruction *ThenTerm, *ElseTerm;
-    SplitBlockAndInsertIfThenElse(Cmp, CodeGenIP.getBlock()->getTerminator(),
-                                  &ThenTerm, &ElseTerm);
-
-    Builder.SetInsertPoint(ThenTerm);
-    Builder.CreateBr(&ContinuationIP);
-    ThenTerm->eraseFromParent();
+    SplitBlockAndInsertIfThenElse(Cmp, &*Builder.GetInsertPoint(), &ThenTerm,
+                                  &ElseTerm);
   };
 
   auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
@@ -1046,8 +1032,7 @@
   unsigned NumFinalizationPoints = 0;
 
   CallInst *CheckedBarrier = nullptr;
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &ContinuationIP) {
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     ++NumBodiesGenerated;
 
     Builder.restoreIP(CodeGenIP);
@@ -1178,8 +1163,7 @@
   Value *StructPtrVal = Builder.CreateCall(RetStructPtrFunc);
 
   Instruction *Internal;
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &ContinuationBB) {
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     IRBuilder<>::InsertPointGuard Guard(Builder);
     Builder.restoreIP(CodeGenIP);
     Internal = Builder.CreateCall(TakeI32Func, I32Val);
@@ -2309,11 +2293,9 @@
   AllocaInst *PrivAI = nullptr;
 
   BasicBlock *EntryBB = nullptr;
-  BasicBlock *ExitBB = nullptr;
   BasicBlock *ThenBB = nullptr;
 
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &FiniBB) {
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     if (AllocaIP.isSet())
       Builder.restoreIP(AllocaIP);
     else
@@ -2328,7 +2310,6 @@
     Builder.restoreIP(CodeGenIP);
 
     // collect some info for checks later
-    ExitBB = FiniBB.getUniqueSuccessor();
     ThenBB = Builder.GetInsertBlock();
     EntryBB = ThenBB->getUniquePredecessor();
 
@@ -2350,7 +2331,7 @@
   BranchInst *EntryBr = cast<BranchInst>(EntryBB->getTerminator());
   EXPECT_TRUE(EntryBr->isConditional());
   EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB);
-  EXPECT_EQ(ThenBB->getUniqueSuccessor(), ExitBB);
+  BasicBlock *ExitBB = ThenBB->getUniqueSuccessor();
   EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB);
 
   CmpInst *CondInst = cast<CmpInst>(EntryBr->getCondition());
@@ -2389,11 +2370,9 @@
   AllocaInst *PrivAI = nullptr;
 
   BasicBlock *EntryBB = nullptr;
-  BasicBlock *ExitBB = nullptr;
   BasicBlock *ThenBB = nullptr;
 
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &FiniBB) {
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     if (AllocaIP.isSet())
       Builder.restoreIP(AllocaIP);
     else
@@ -2408,7 +2387,6 @@
     Builder.restoreIP(CodeGenIP);
 
     // collect some info for checks later
-    ExitBB = FiniBB.getUniqueSuccessor();
     ThenBB = Builder.GetInsertBlock();
     EntryBB = ThenBB->getUniquePredecessor();
 
@@ -2432,7 +2410,7 @@
   BranchInst *EntryBr = cast<BranchInst>(EntryBB->getTerminator());
   EXPECT_TRUE(EntryBr->isConditional());
   EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB);
-  EXPECT_EQ(ThenBB->getUniqueSuccessor(), ExitBB);
+  BasicBlock *ExitBB = ThenBB->getUniqueSuccessor();
   EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB);
 
   CmpInst *CondInst = cast<CmpInst>(EntryBr->getCondition());
@@ -2470,18 +2448,11 @@
 
   AllocaInst *PrivAI = Builder.CreateAlloca(F->arg_begin()->getType());
 
-  BasicBlock *EntryBB = nullptr;
-
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &FiniBB) {
-    // collect some info for checks later
-    EntryBB = FiniBB.getUniquePredecessor();
-
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     // actual start for bodyCB
     llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
     llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint();
     EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst);
-    EXPECT_EQ(EntryBB, CodeGenIPBB);
 
     // body begin
     Builder.restoreIP(CodeGenIP);
@@ -2495,13 +2466,11 @@
     BasicBlock *IPBB = IP.getBlock();
     EXPECT_NE(IPBB->end(), IP.getPoint());
   };
+  BasicBlock *EntryBB = Builder.GetInsertBlock();
 
   Builder.restoreIP(OMPBuilder.createCritical(Builder, BodyGenCB, FiniCB,
                                               "testCRT", nullptr));
 
-  Value *EntryBBTI = EntryBB->getTerminator();
-  EXPECT_EQ(EntryBBTI, nullptr);
-
   CallInst *CriticalEntryCI = nullptr;
   for (auto &EI : *EntryBB) {
     Instruction *cur = &EI;
@@ -2720,16 +2689,10 @@
   AllocaInst *PrivAI =
       Builder.CreateAlloca(F->arg_begin()->getType(), nullptr, "priv.inst");
 
-  BasicBlock *EntryBB = nullptr;
-
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &FiniBB) {
-    EntryBB = FiniBB.getUniquePredecessor();
-
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
     llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint();
     EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst);
-    EXPECT_EQ(EntryBB, CodeGenIPBB);
 
     Builder.restoreIP(CodeGenIP);
     Builder.CreateStore(F->arg_begin(), PrivAI);
@@ -2744,6 +2707,7 @@
   };
 
   // Test for "#omp ordered [threads]"
+  BasicBlock *EntryBB = Builder.GetInsertBlock();
   Builder.restoreIP(
       OMPBuilder.createOrderedThreadsSimd(Builder, BodyGenCB, FiniCB, true));
 
@@ -2796,16 +2760,10 @@
   AllocaInst *PrivAI =
       Builder.CreateAlloca(F->arg_begin()->getType(), nullptr, "priv.inst");
 
-  BasicBlock *EntryBB = nullptr;
-
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &FiniBB) {
-    EntryBB = FiniBB.getUniquePredecessor();
-
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
     llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint();
     EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst);
-    EXPECT_EQ(EntryBB, CodeGenIPBB);
 
     Builder.restoreIP(CodeGenIP);
     Builder.CreateStore(F->arg_begin(), PrivAI);
@@ -2820,6 +2778,7 @@
   };
 
   // Test for "#omp ordered simd"
+  BasicBlock *EntryBB = Builder.GetInsertBlock();
   Builder.restoreIP(
       OMPBuilder.createOrderedThreadsSimd(Builder, BodyGenCB, FiniCB, false));
 
@@ -2903,11 +2862,9 @@
   AllocaInst *PrivAI = nullptr;
 
   BasicBlock *EntryBB = nullptr;
-  BasicBlock *ExitBB = nullptr;
   BasicBlock *ThenBB = nullptr;
 
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &FiniBB) {
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     if (AllocaIP.isSet())
       Builder.restoreIP(AllocaIP);
     else
@@ -2922,7 +2879,6 @@
     Builder.restoreIP(CodeGenIP);
 
     // collect some info for checks later
-    ExitBB = FiniBB.getUniqueSuccessor();
     ThenBB = Builder.GetInsertBlock();
     EntryBB = ThenBB->getUniquePredecessor();
 
@@ -2945,7 +2901,7 @@
   BranchInst *EntryBr = cast<BranchInst>(EntryBB->getTerminator());
   EXPECT_TRUE(EntryBr->isConditional());
   EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB);
-  EXPECT_EQ(ThenBB->getUniqueSuccessor(), ExitBB);
+  BasicBlock *ExitBB = ThenBB->getUniqueSuccessor();
   EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB);
 
   CmpInst *CondInst = cast<CmpInst>(EntryBr->getCondition());
@@ -2996,11 +2952,9 @@
   AllocaInst *PrivAI = nullptr;
 
   BasicBlock *EntryBB = nullptr;
-  BasicBlock *ExitBB = nullptr;
   BasicBlock *ThenBB = nullptr;
 
-  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &FiniBB) {
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     if (AllocaIP.isSet())
       Builder.restoreIP(AllocaIP);
     else
@@ -3015,7 +2969,6 @@
     Builder.restoreIP(CodeGenIP);
 
     // collect some info for checks later
-    ExitBB = FiniBB.getUniqueSuccessor();
     ThenBB = Builder.GetInsertBlock();
     EntryBB = ThenBB->getUniquePredecessor();
 
@@ -3038,7 +2991,7 @@
   BranchInst *EntryBr = cast<BranchInst>(EntryBB->getTerminator());
   EXPECT_TRUE(EntryBr->isConditional());
   EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB);
-  EXPECT_EQ(ThenBB->getUniqueSuccessor(), ExitBB);
+  BasicBlock *ExitBB = ThenBB->getUniqueSuccessor();
   EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB);
 
   CmpInst *CondInst = cast<CmpInst>(EntryBr->getCondition());
@@ -3718,8 +3671,7 @@
   //   xor of thread-id;
   // and store the result in global variables.
   InsertPointTy BodyIP, BodyAllocaIP;
-  auto BodyGenCB = [&](InsertPointTy InnerAllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &ContinuationBB) {
+  auto BodyGenCB = [&](InsertPointTy InnerAllocaIP, InsertPointTy CodeGenIP) {
     IRBuilderBase::InsertPointGuard Guard(Builder);
     Builder.restoreIP(CodeGenIP);
 
@@ -3958,8 +3910,7 @@
 
   InsertPointTy FirstBodyIP, FirstBodyAllocaIP;
   auto FirstBodyGenCB = [&](InsertPointTy InnerAllocaIP,
-                            InsertPointTy CodeGenIP,
-                            BasicBlock &ContinuationBB) {
+                            InsertPointTy CodeGenIP) {
     IRBuilderBase::InsertPointGuard Guard(Builder);
     Builder.restoreIP(CodeGenIP);
 
@@ -3979,8 +3930,7 @@
 
   InsertPointTy SecondBodyIP, SecondBodyAllocaIP;
   auto SecondBodyGenCB = [&](InsertPointTy InnerAllocaIP,
-                             InsertPointTy CodeGenIP,
-                             BasicBlock &ContinuationBB) {
+                             InsertPointTy CodeGenIP) {
     IRBuilderBase::InsertPointGuard Guard(Builder);
     Builder.restoreIP(CodeGenIP);
 
@@ -4126,11 +4076,7 @@
   llvm::SmallVector<BasicBlock *, 4> CaseBBs;
 
   auto FiniCB = [&](InsertPointTy IP) {};
-  auto SectionCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &FiniBB) {
-    Builder.restoreIP(CodeGenIP);
-    Builder.CreateBr(&FiniBB);
-  };
+  auto SectionCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};
   SectionCBVector.push_back(SectionCB);
 
   auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
@@ -4158,8 +4104,6 @@
   llvm::SmallVector<BasicBlock *, 4> CaseBBs;
 
   BasicBlock *SwitchBB = nullptr;
-  BasicBlock *ForExitBB = nullptr;
-  BasicBlock *ForIncBB = nullptr;
   AllocaInst *PrivAI = nullptr;
   SwitchInst *Switch = nullptr;
 
@@ -4173,8 +4117,7 @@
     EXPECT_NE(IPBB->end(), IP.getPoint());
   };
 
-  auto SectionCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
-                       BasicBlock &FiniBB) {
+  auto SectionCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     ++NumBodiesGenerated;
     CaseBBs.push_back(CodeGenIP.getBlock());
     SwitchBB = CodeGenIP.getBlock()->getSinglePredecessor();
@@ -4183,9 +4126,6 @@
     Value *PrivLoad =
         Builder.CreateLoad(F->arg_begin()->getType(), PrivAI, "local.alloca");
     Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
-    Builder.CreateBr(&FiniBB);
-    ForIncBB =
-        CodeGenIP.getBlock()->getSinglePredecessor()->getSingleSuccessor();
   };
   auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
                    llvm::Value &, llvm::Value &Val, llvm::Value *&ReplVal) {
@@ -4204,7 +4144,7 @@
 
   // Switch BB's predecessor is loop condition BB, whose successor at index 1 is
   // loop's exit BB
-  ForExitBB =
+  BasicBlock *ForExitBB =
       SwitchBB->getSinglePredecessor()->getTerminator()->getSuccessor(1);
   EXPECT_NE(ForExitBB, nullptr);
 
@@ -4213,7 +4153,6 @@
   EXPECT_EQ(F, OutlinedFn);
   EXPECT_FALSE(verifyModule(*M, &errs()));
   EXPECT_EQ(OutlinedFn->arg_size(), 1U);
-  EXPECT_EQ(OutlinedFn->getBasicBlockList().size(), size_t(11));
 
   BasicBlock *LoopPreheaderBB =
       OutlinedFn->getEntryBlock().getSingleSuccessor();
@@ -4254,13 +4193,10 @@
   EXPECT_EQ(isa<SwitchInst>(SwitchBB->getTerminator()), true);
   Switch = cast<SwitchInst>(SwitchBB->getTerminator());
   EXPECT_EQ(Switch->getNumCases(), 2U);
-  EXPECT_NE(ForIncBB, nullptr);
-  EXPECT_EQ(Switch->getSuccessor(0), ForIncBB);
 
   EXPECT_EQ(CaseBBs.size(), 2U);
   for (auto *&CaseBB : CaseBBs) {
     EXPECT_EQ(CaseBB->getParent(), OutlinedFn);
-    EXPECT_EQ(CaseBB->getSingleSuccessor(), ForExitBB);
   }
 
   ASSERT_EQ(NumBodiesGenerated, 2U);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -95,6 +95,8 @@
   // also be used for alloca insertion which would result in insertion order
   // confusion. Create a new BasicBlock for the Builder and use the entry block
   // for the allocs.
+  // TODO: Create a dedicated alloca BasicBlock at function creation such that
+  // we do not need to move the current InsertPoint here.
   if (builder.GetInsertBlock() ==
       &builder.GetInsertBlock()->getParent()->getEntryBlock()) {
     assert(builder.GetInsertPoint() == builder.GetInsertBlock()->end() &&
@@ -117,11 +119,14 @@
 /// region, and a branch from any block with an successor-less OpenMP terminator
 /// to `continuationBlock`. Populates `continuationBlockPHIs` with the PHI nodes
 /// of the continuation block if provided.
-static void convertOmpOpRegions(
-    Region &region, StringRef blockName, llvm::BasicBlock &sourceBlock,
-    llvm::BasicBlock &continuationBlock, llvm::IRBuilderBase &builder,
+static llvm::BasicBlock *convertOmpOpRegions(
+    Region &region, StringRef blockName, llvm::IRBuilderBase &builder,
     LLVM::ModuleTranslation &moduleTranslation, LogicalResult &bodyGenStatus,
     SmallVectorImpl<llvm::PHINode *> *continuationBlockPHIs = nullptr) {
+  llvm::BasicBlock *continuationBlock =
+      splitBB(builder, true, "omp.region.cont");
+  llvm::BasicBlock *sourceBlock = builder.GetInsertBlock();
+
   llvm::LLVMContext &llvmContext = builder.getContext();
   for (Block &bb : region) {
     llvm::BasicBlock *llvmBB = llvm::BasicBlock::Create(
@@ -130,7 +135,7 @@
     moduleTranslation.mapBlock(&bb, llvmBB);
   }
 
-  llvm::Instruction *sourceTerminator = sourceBlock.getTerminator();
+  llvm::Instruction *sourceTerminator = sourceBlock->getTerminator();
 
   // Terminators (namely YieldOp) may be forwarding values to the region that
   // need to be available in the continuation block. Collect the types of these
@@ -170,7 +175,7 @@
   if (continuationBlockPHIs) {
     llvm::IRBuilderBase::InsertPointGuard guard(builder);
     continuationBlockPHIs->reserve(continuationBlockPHITypes.size());
-    builder.SetInsertPoint(&continuationBlock, continuationBlock.begin());
+    builder.SetInsertPoint(continuationBlock, continuationBlock->begin());
     for (llvm::Type *ty : continuationBlockPHITypes)
       continuationBlockPHIs->push_back(builder.CreatePHI(ty, numYields));
   }
@@ -186,7 +191,7 @@
     if (bb->isEntryBlock()) {
       assert(sourceTerminator->getNumSuccessors() == 1 &&
              "provided entry block has multiple successors");
-      assert(sourceTerminator->getSuccessor(0) == &continuationBlock &&
+      assert(sourceTerminator->getSuccessor(0) == continuationBlock &&
              "ContinuationBlock is not the successor of the entry block");
       sourceTerminator->setSuccessor(0, llvmBB);
     }
@@ -195,7 +200,7 @@
     if (failed(
             moduleTranslation.convertBlock(*bb, bb->isEntryBlock(), builder))) {
       bodyGenStatus = failure();
-      return;
+      return continuationBlock;
     }
 
     // Special handling for `omp.yield` and `omp.terminator` (we may have more
@@ -207,7 +212,7 @@
     // in the same code that handles the region-owning operation.
     Operation *terminator = bb->getTerminator();
     if (isa<omp::TerminatorOp, omp::YieldOp>(terminator)) {
-      builder.CreateBr(&continuationBlock);
+      builder.CreateBr(continuationBlock);
 
       for (unsigned i = 0, e = terminator->getNumOperands(); i < e; ++i)
         (*continuationBlockPHIs)[i]->addIncoming(
@@ -223,6 +228,8 @@
   // be converted several times, that is cloned, without clashes, and slightly
   // speeds up the lookups.
   moduleTranslation.forgetMapping(region);
+
+  return continuationBlock;
 }
 
 /// Convert ProcBindKind from MLIR-generated enum to LLVM enum.
@@ -249,16 +256,15 @@
   // relying on captured variables.
   LogicalResult bodyGenStatus = success();
 
-  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
-                       llvm::BasicBlock &continuationBlock) {
+  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
     // Save the alloca insertion point on ModuleTranslation stack for use in
     // nested regions.
     LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
         moduleTranslation, allocaIP);
 
     // ParallelOp has only one region associated with it.
-    convertOmpOpRegions(opInst.getRegion(), "omp.par.region",
-                        *codeGenIP.getBlock(), continuationBlock, builder,
+    builder.restoreIP(codeGenIP);
+    convertOmpOpRegions(opInst.getRegion(), "omp.par.region", builder,
                         moduleTranslation, bodyGenStatus);
   };
 
@@ -308,12 +314,11 @@
   // relying on captured variables.
   LogicalResult bodyGenStatus = success();
 
-  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
-                       llvm::BasicBlock &continuationBlock) {
+  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
     // MasterOp has only one region associated with it.
     auto &region = cast<omp::MasterOp>(opInst).getRegion();
-    convertOmpOpRegions(region, "omp.master.region", *codeGenIP.getBlock(),
-                        continuationBlock, builder, moduleTranslation,
+    builder.restoreIP(codeGenIP);
+    convertOmpOpRegions(region, "omp.master.region", builder, moduleTranslation,
                         bodyGenStatus);
   };
 
@@ -337,13 +342,12 @@
   // relying on captured variables.
   LogicalResult bodyGenStatus = success();
 
-  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
-                       llvm::BasicBlock &continuationBlock) {
+  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
     // CriticalOp has only one region associated with it.
     auto &region = cast<omp::CriticalOp>(opInst).getRegion();
-    convertOmpOpRegions(region, "omp.critical.region", *codeGenIP.getBlock(),
-                        continuationBlock, builder, moduleTranslation,
-                        bodyGenStatus);
+    builder.restoreIP(codeGenIP);
+    convertOmpOpRegions(region, "omp.critical.region", builder,
+                        moduleTranslation, bodyGenStatus);
   };
 
   // TODO: Perform finalization actions for variables. This has to be
@@ -438,19 +442,10 @@
     return success();
   }
 
-  // Create the continuation block manually instead of calling splitBlock
-  // because the current insertion block may not have a terminator.
-  llvm::BasicBlock *continuationBlock =
-      llvm::BasicBlock::Create(builder.getContext(), blockName + ".cont",
-                               builder.GetInsertBlock()->getParent(),
-                               builder.GetInsertBlock()->getNextNode());
-  builder.CreateBr(continuationBlock);
-
   LogicalResult bodyGenStatus = success();
   SmallVector<llvm::PHINode *> phis;
-  convertOmpOpRegions(region, blockName, *builder.GetInsertBlock(),
-                      *continuationBlock, builder, moduleTranslation,
-                      bodyGenStatus, &phis);
+  llvm::BasicBlock *continuationBlock = convertOmpOpRegions(
+      region, blockName, builder, moduleTranslation, bodyGenStatus, &phis);
   if (failed(bodyGenStatus))
     return failure();
   if (continuationBlockArgs)
@@ -578,13 +573,12 @@
   // relying on captured variables.
   LogicalResult bodyGenStatus = success();
 
-  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
-                       llvm::BasicBlock &continuationBlock) {
+  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
     // OrderedOp has only one region associated with it.
     auto &region = cast<omp::OrderedRegionOp>(opInst).getRegion();
-    convertOmpOpRegions(region, "omp.ordered.region", *codeGenIP.getBlock(),
-                        continuationBlock, builder, moduleTranslation,
-                        bodyGenStatus);
+    builder.restoreIP(codeGenIP);
+    convertOmpOpRegions(region, "omp.ordered.region", builder,
+                        moduleTranslation, bodyGenStatus);
   };
 
   // TODO: Perform finalization actions for variables. This has to be
@@ -626,12 +620,10 @@
 
     Region &region = sectionOp.region();
     auto sectionCB = [&region, &builder, &moduleTranslation, &bodyGenStatus](
-                         InsertPointTy allocaIP, InsertPointTy codeGenIP,
-                         llvm::BasicBlock &finiBB) {
+                         InsertPointTy allocaIP, InsertPointTy codeGenIP) {
       builder.restoreIP(codeGenIP);
-      builder.CreateBr(&finiBB);
-      convertOmpOpRegions(region, "omp.section.region", *codeGenIP.getBlock(),
-                          finiBB, builder, moduleTranslation, bodyGenStatus);
+      convertOmpOpRegions(region, "omp.section.region", builder,
+                          moduleTranslation, bodyGenStatus);
     };
     sectionCBs.push_back(sectionCB);
   }
@@ -674,10 +666,9 @@
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   LogicalResult bodyGenStatus = success();
-  auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP,
-                    llvm::BasicBlock &continuationBB) {
-    convertOmpOpRegions(singleOp.region(), "omp.single.region",
-                        *codegenIP.getBlock(), continuationBB, builder,
+  auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
+    builder.restoreIP(codegenIP);
+    convertOmpOpRegions(singleOp.region(), "omp.single.region", builder,
                         moduleTranslation, bodyGenStatus);
   };
   auto finiCB = [&](InsertPointTy codeGenIP) {};
@@ -783,11 +774,9 @@
       return;
 
     // Convert the body of the loop.
-    llvm::BasicBlock *entryBlock = ip.getBlock();
-    llvm::BasicBlock *exitBlock =
-        entryBlock->splitBasicBlock(ip.getPoint(), "omp.wsloop.exit");
-    convertOmpOpRegions(loop.region(), "omp.wsloop.region", *entryBlock,
-                        *exitBlock, builder, moduleTranslation, bodyGenStatus);
+    builder.restoreIP(ip);
+    convertOmpOpRegions(loop.region(), "omp.wsloop.region", builder,
+                        moduleTranslation, bodyGenStatus);
   };
 
   // Delegate actual loop construction to the OpenMP IRBuilder.
@@ -922,11 +911,9 @@
       return;
 
     // Convert the body of the loop.
-    llvm::BasicBlock *entryBlock = ip.getBlock();
-    llvm::BasicBlock *exitBlock =
-        entryBlock->splitBasicBlock(ip.getPoint(), "omp.simdloop.exit");
-    convertOmpOpRegions(loop.region(), "omp.simdloop.region", *entryBlock,
-                        *exitBlock, builder, moduleTranslation, bodyGenStatus);
+    builder.restoreIP(ip);
+    convertOmpOpRegions(loop.region(), "omp.simdloop.region", builder,
+                        moduleTranslation, bodyGenStatus);
   };
 
   // Delegate actual loop construction to the OpenMP IRBuilder.
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -1147,11 +1147,10 @@
 
   omp.wsloop ordered(0)
   for (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) {
-    // CHECK: [[OMP_THREAD:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-    // CHECK-NEXT:  call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_THREAD]])
+    // CHECK:  call void @__kmpc_ordered(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[OMP_THREAD2:%.*]])
     omp.ordered_region  {
       omp.terminator
-    // CHECK: call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB1]], i32 [[OMP_THREAD]])
+    // CHECK: call void @__kmpc_end_ordered(%struct.ident_t* @[[GLOB3]], i32 [[OMP_THREAD2]])
     }
     omp.yield
   }
@@ -1963,37 +1962,37 @@
   // CHECK-NEXT:     i32 1, label %[[SECTION2:.*]]
   // CHECK-NEXT: ]
 
-  // CHECK: [[INC]]:
-  // CHECK:   %{{.*}} = add {{.*}}, 1
-  // CHECK:   br label %[[HEADER]]
-
-  // CHECK: [[EXIT]]:
-  // CHECK:   call void @__kmpc_for_static_fini({{.*}})
-  // CHECK:   call void @__kmpc_barrier({{.*}})
-  // CHECK:   br label %[[AFTER:.*]]
-
-  // CHECK: [[AFTER]]:
-  // CHECK:   br label %[[END:.*]]
-
-  // CHECK: [[END]]:
-  // CHECK:   ret void
   omp.sections {
     omp.section {
       // CHECK: [[SECTION1]]:
-      // CHECK-NEXT: br label %[[REGION1:[^ ,]*]]
-      // CHECK: [[REGION1]]:
-      // CHECK-NEXT: br label %[[EXIT]]
+      // CHECK-NEXT: br label %[[SECTION1_REGION1:[^ ,]*]]
+      // CHECK-EMPTY:
+      // CHECK-NEXT: [[SECTION1_REGION1]]:
+      // CHECK-NEXT: br label %[[SECTION1_REGION2:[^ ,]*]]
+      // CHECK-EMPTY:
+      // CHECK-NEXT: [[SECTION1_REGION2]]:
+      // CHECK-NEXT: br label %[[INC]]
       omp.terminator
     }
     omp.section {
       // CHECK: [[SECTION2]]:
-      // CHECK-NEXT: br label %[[REGION2:[^ ,]*]]
-      // CHECK: [[REGION2]]:
-      // CHECK-NEXT: br label %[[EXIT]]
+      // CHECK: br label %[[INC]]
       omp.terminator
     }
     omp.terminator
   }
+
+  // CHECK: [[INC]]:
+  // CHECK:   %{{.*}} = add {{.*}}, 1
+  // CHECK:   br label %[[HEADER]]
+
+  // CHECK: [[EXIT]]:
+  // CHECK:   call void @__kmpc_for_static_fini({{.*}})
+  // CHECK:   call void @__kmpc_barrier({{.*}})
+  // CHECK:   br label %[[AFTER:.*]]
+
+  // CHECK: [[AFTER]]:
+  // CHECK:   ret void
   llvm.return
 }